File tree Expand file tree Collapse file tree 3 files changed +10 -32 lines changed
examples/quantization_kv_cache Expand file tree Collapse file tree 3 files changed +10 -32 lines changed
Original file line number Diff line number Diff line change
@@ -78,23 +78,13 @@ def process_and_tokenize(example):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-print(
-    "Note: Inference with the quantized kv_cache is not supported. ",
-    "Please use vLLM for inference with the quantized kv_cache.",
-)
 # Confirm generations of the quantized model look sane.
-
-# NOTE: transformers 4.49.0 results in a generation error with gemma2.
-# Consider either downgrading your transformers version to a previous version
-# or use vLLM for sample generation.
-# Note: compile is disabled: https://github.com/huggingface/transformers/issues/38333
 print("\n\n")
-dispatch_for_generation(model)
 print("========== SAMPLE GENERATION ==============")
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
-    model.device
-)
-output = model.generate(input_ids, max_new_tokens=100, disable_compile=True)
+dispatch_for_generation(model)
+sample = tokenizer("Hello my name is", return_tensors="pt")
+sample = {key: value.to(model.device) for key, value in sample.items()}
+output = model.generate(**sample, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
 
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
 from datasets import load_dataset
-from loguru import logger
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor import oneshot
@@ -79,19 +78,13 @@ def process_and_tokenize(example):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-logger.info(
-    "Running sample generation. ",
-    "Note: Inference with the quantized kv_cache is not supported. ",
-    "Please use vLLM for inference with the quantized kv_cache.",
-)
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
-    model.device
-)
-output = model.generate(input_ids, max_new_tokens=100)
+sample = tokenizer("Hello my name is", return_tensors="pt")
+sample = {key: value.to(model.device) for key, value in sample.items()}
+output = model.generate(**sample, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
 
Original file line number Diff line number Diff line change
@@ -80,18 +80,13 @@ def process_and_tokenize(example):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-print(
-    "Note: Inference with the quantized kv_cache is not supported. ",
-    "Please use vLLM for inference with the quantized kv_cache.",
-)
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
-    model.device
-)
-output = model.generate(input_ids, max_new_tokens=100)
+sample = tokenizer("Hello my name is", return_tensors="pt")
+sample = {key: value.to(model.device) for key, value in sample.items()}
+output = model.generate(**sample, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
 
You can’t perform that action at this time.
0 commit comments