Commit 5bfb497

Author: Sara Adkins

Add MoE and Compressed Inference Examples (#160)

* add deepseek example
* renaming
* run_compressed_example
* update moe example
* extra comment

Parent: 756516b

File tree: 4 files changed, +141 −1 lines
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
from transformers import AutoTokenizer

from llmcompressor.transformers import SparseAutoModelForCausalLM

"""
This example covers how to load a quantized model in compressed mode. By default,
SparseAutoModelForCausalLM will decompress the whole model on load, resulting in no
memory savings from quantization. By setting the `run_compressed` kwarg to True, the
model will remain compressed in memory on load, saving memory during inference at the
cost of increased runtime.

During inference, each layer is decompressed as needed before its forward pass.
This saves memory, as only a single layer is ever decompressed at a time, but increases
runtime because each layer must be decompressed before running its forward pass.
"""

# any model with the "compressed-tensors" quant_method and "compressed"
# quantization_status in the quantization config is supported
MODEL_STUB = "nm-testing/tinyllama-fp8-dynamic-compressed"

SAMPLE_INPUT = [
    "I love quantization because",
    "What is the capital of France?",
    "def fibonacci(n):",
]

# set run_compressed=True to enable running in compressed mode
compressed_model = SparseAutoModelForCausalLM.from_pretrained(
    MODEL_STUB, torch_dtype="auto", device_map="cuda:0", run_compressed=True
)

# tokenize the sample data
tokenizer = AutoTokenizer.from_pretrained(MODEL_STUB)
inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(
    compressed_model.device
)

# run the compressed model and decode the output
output = compressed_model.generate(**inputs, max_length=50)
print("========== SAMPLE GENERATION ==============")
text_output = tokenizer.batch_decode(output)
for sample in text_output:
    print(sample)
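
The comment above notes that any checkpoint with the "compressed-tensors" quant_method and "compressed" quantization_status is supported. Below is a minimal sketch (not part of this commit) of how one might check those fields before enabling `run_compressed`, assuming the checkpoint's `config.json` exposes `quantization_config` as a plain dict:

```python
# Hedged sketch: inspect a checkpoint's quantization config to see whether it is
# eligible for run_compressed=True. Field names follow the comment in the example
# above; the dict access assumes quantization_config is stored as a plain dict.
from transformers import AutoConfig

MODEL_STUB = "nm-testing/tinyllama-fp8-dynamic-compressed"

config = AutoConfig.from_pretrained(MODEL_STUB)
quant_config = getattr(config, "quantization_config", None) or {}

is_supported = (
    quant_config.get("quant_method") == "compressed-tensors"
    and quant_config.get("quantization_status") == "compressed"
)
print(f"run_compressed supported for {MODEL_STUB}: {is_supported}")
```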

examples/quantizing_moe_fp8/README.md renamed to examples/quantizing_moe/README.md

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ This example leverages `llm-compressor` and `compressed-tensors` to create an FP
You can follow the detailed steps below or simply run the example script with:

```bash
-python examples/quantizing_moe_fp8/mixtral_moe.py
+python examples/quantizing_moe/mixtral_moe_fp8.py
```

### Step 1: Select a Model, Dataset, and Recipe
Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
import torch
from datasets import load_dataset
from transformers import AutoTokenizer

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

# select a Mixture of Experts model for quantization
MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"

# adjust based on the number of desired GPUs
# if not enough GPU memory is available, some layers will automatically be offloaded to the CPU
device_map = calculate_offload_device_map(
    MODEL_ID,
    reserve_for_hessians=True,
    num_gpus=2,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

model = SparseAutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16, trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.
# it is recommended to use more calibration samples for MoE models so each expert is hit
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 2048
MAX_SEQUENCE_LENGTH = 2048


# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))


def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)

# define a llmcompressor recipe for W8A8 quantization
# since the MoE gate layers are sensitive to quantization, we add them to the ignore
# list so they remain at full precision
recipe = [
    GPTQModifier(
        targets="Linear",
        scheme="W8A8",
        ignore=["lm_head", "re:.*mlp.gate$"],
        sequential_update=True,
    ),
]

SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8"

oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    save_compressed=True,
    output_dir=SAVE_DIR,
)


print("========== SAMPLE GENERATION ==============")
SAMPLE_INPUT = ["I love quantization because"]
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device)
output = model.generate(**inputs, max_length=50)
text_output = tokenizer.batch_decode(output)
print(text_output)
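
The directory written by `oneshot(..., save_compressed=True)` above can then be loaded back for inference in the same way as the compressed-inference example in this commit. Below is a minimal sketch (not part of this commit), assuming the W8A8 compressed checkpoint supports `run_compressed` like the FP8 example above and that DeepSeek still requires `trust_remote_code=True` on reload:

```python
# Hedged sketch: reload the compressed checkpoint saved by the script above.
# SAVE_DIR mirrors MODEL_ID.split("/")[1] + "-W8A8" from that script.
from transformers import AutoTokenizer

from llmcompressor.transformers import SparseAutoModelForCausalLM

SAVE_DIR = "DeepSeek-Coder-V2-Lite-Instruct-W8A8"

reloaded_model = SparseAutoModelForCausalLM.from_pretrained(
    SAVE_DIR,
    torch_dtype="auto",
    device_map="cuda:0",
    run_compressed=True,
    trust_remote_code=True,
)
# the tokenizer is loaded from the original model id, since the script above does
# not explicitly save it alongside the compressed weights
tokenizer = AutoTokenizer.from_pretrained(
    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", trust_remote_code=True
)

inputs = tokenizer(["I love quantization because"], return_tensors="pt").to(
    reloaded_model.device
)
output = reloaded_model.generate(**inputs, max_length=50)
print(tokenizer.batch_decode(output))
```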
