diff --git a/google-gemma-Gemma3-4B/qnn/README.md b/google-gemma-Gemma3-4B/qnn/README.md
new file mode 100644
index 00000000..2f7895dc
--- /dev/null
+++ b/google-gemma-Gemma3-4B/qnn/README.md
@@ -0,0 +1,122 @@
+# Gemma-3-4B Model Optimization
+
+This example demonstrates the optimization of the [Google Gemma-3-4B](https://huggingface.co/google/gemma-3-4b-it) model using **post-training quantization (PTQ)** techniques for QNN (Qualcomm Neural Network) execution. The optimization process uses an environment based heavily on the [PTQ tutorial for Phi-3.5](https://github.com/CodeLinaro/olive-recipes/blob/main/microsoft-Phi-3.5-mini-instruct/aitk/README.md).
+
+## File Overview
+
+This example contains the following key files:
+
+- **`env_setup.sh`** - Automated environment setup script (Linux only)
+- **`gemma3-4b-text-qnn-config.json`** - Olive configuration for optimizing the text component
+- **`gemma3-4b-vision-qnn-config.json`** - Olive configuration for optimizing the vision component
+- **`gemma3-4b-embedding-qnn-config.json`** - Olive configuration for optimizing the embedding component
+- **`custom_gemma3_4b_datasets.py`** - Dataset handling and preprocessing utilities
+- **`custom_gemma3_4b_vision.py`** - Vision model loader for the optimization pipeline
+- **`custom_gemma3_4b_embedding.py`** - Embedding model loader for the optimization pipeline
+- **`gemma-3-4b.ipynb`** - End-to-end notebook that runs the recipes and assembles the final ORT GenAI folder
+- **`genai/`** - onnxruntime-genai configuration (`genai_config.json`) and sample inference script (`app.py`)
+
+## Prerequisites
+
+### System Requirements
+- **Operating System**: Linux (automated setup script is Linux-only)
+- **Python**: 3.10
+- **Package Manager**: [uv](https://docs.astral.sh/uv/getting-started/installation/#installation-methods)
+- **Storage**: ~13GB for COCO train2017 dataset (downloaded automatically)
+
+### Dependencies Installed by Setup Script
+The `env_setup.sh` script installs the following components:
+- setuptools (for building Olive from source)
+- Olive requirements and dependencies
+- AutoGPTQ (from source)
+- GPTQModel (specific commit: `558449bed3ef2653c36041650d30da6bbbca440d`)
+- onnxruntime-qnn (pre-release version)
+
+## Setup Instructions
+
+### Automated Setup (Recommended)
+```bash
+source env_setup.sh
+```
+
+### Manual Setup (Alternative)
+If you prefer to set up manually or need to troubleshoot:
+
+1. Install setuptools:
+   ```bash
+   uv pip install setuptools
+   ```
+
+2. Install requirements:
+   ```bash
+   uv pip install -r ../requirements.txt
+   uv pip install -r ../../../requirements.txt
+   ```
+
+3. Install AutoGPTQ from source:
+   ```bash
+   export BUILD_CUDA_EXT=0
+   uv pip install --no-build-isolation git+https://github.com/PanQiWei/AutoGPTQ.git
+   ```
+
+4. Install GPTQModel with Gemma3 fix:
+   ```bash
+   uv pip install --no-build-isolation git+https://github.com/ModelCloud/GPTQModel.git@558449bed3ef2653c36041650d30da6bbbca440d
+   ```
+
+5. Install onnxruntime-qnn:
+   ```bash
+   uv pip install -r https://raw.githubusercontent.com/microsoft/onnxruntime/refs/heads/main/requirements.txt
+   uv pip install -U --pre --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple onnxruntime-qnn --no-deps
+   ```
+
+> **Important:** The setup uses a specific commit hash for GPTQModel (`558449bed3ef2653c36041650d30da6bbbca440d`) to address a [memory leak issue](https://github.com/ModelCloud/GPTQModel/commit/558449bed3ef2653c36041650d30da6bbbca440d) with Gemma3 models.
+
+## Optimization Process
+
+Since Gemma-3-4B is a multi-modal model composed of both vision and text components, the strategy for optimizing it through Olive is to operate on the constituent models separately before configuring them to work together at the onnxruntime-genai stage.
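+
+For reference, the `genai/` folder shows how the optimized pieces are consumed at that final stage. The snippet below is a minimal sketch condensed from `genai/app.py`, assuming the composed artifacts (including `genai_config.json`) have been collected under `models/gemma3_qnn/model/` as done in `gemma-3-4b.ipynb`, and using a placeholder image path:
+
+```python
+# Minimal sketch of the onnxruntime-genai composition stage; see genai/app.py for the full script.
+# Assumes the text, vision, and embedding artifacts plus genai_config.json sit under
+# models/gemma3_qnn/model/ and that "dog.jpg" is a placeholder local image.
+import json
+
+import onnxruntime_genai as og
+
+config = og.Config("models/gemma3_qnn/model")
+model = og.Model(config)
+tokenizer = og.Tokenizer(model)
+processor = model.create_multimodal_processor()
+stream = processor.create_stream()
+
+images = og.Images.open("dog.jpg")
+messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "What is shown in this image?"}]}]
+prompt = tokenizer.apply_chat_template(json.dumps(messages), add_generation_prompt=True)
+
+params = og.GeneratorParams(model)
+params.set_search_options(max_length=1024)
+generator = og.Generator(model, params)
+generator.set_inputs(processor(prompt, images=images))
+
+while not generator.is_done():
+    generator.generate_next_token()
+    print(stream.decode(generator.get_next_tokens()[0]), end="", flush=True)
+```
+
+The sections below describe how each of those components is produced.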
+
+### Configuration Differences
+
+**Text Configuration (`gemma3-4b-text-qnn-config.json`)**:
+- Uses the HuggingFace model directly (`google/gemma-3-4b-it`)
+- Applies a comprehensive optimization pipeline: GptqModel → ModelBuilder → MatMulNBitsToQDQ → GraphSurgeries → static quantization → model split → QNN context binary generation
+- Outputs to: `models/gemma3_qnn/`
+
+**Vision Configuration (`gemma3-4b-vision-qnn-config.json`)**:
+- Uses a custom PyTorch model loader (`custom_gemma3_4b_vision.py`)
+- Simpler pipeline: ONNX Conversion → GraphSurgeries → static quantization → QNN context binary generation
+- Outputs to: `models/gemma-3-4b-it-vision/`
+
+**Embedding Configuration (`gemma3-4b-embedding-qnn-config.json`)**:
+- Uses a custom PyTorch model loader (`custom_gemma3_4b_embedding.py`)
+- Pipeline: ONNX Conversion → static quantization
+- Outputs to: `models/gemma-3-4b-it-embed/`
+
+### Running Optimization
+
+Execute the following commands to separately produce optimized binaries for each component:
+
+```bash
+olive run --config gemma3-4b-text-qnn-config.json
+```
+
+```bash
+olive run --config gemma3-4b-vision-qnn-config.json
+```
+
+```bash
+olive run --config gemma3-4b-embedding-qnn-config.json
+```
+
+## Expected Outputs
+
+After successful optimization, you will find:
+
+- **Text model outputs**: `models/gemma3_qnn/`
+- **Vision model outputs**: `models/gemma-3-4b-it-vision/`
+- **Embedding model outputs**: `models/gemma-3-4b-it-embed/`
+- **Cache directories**: `cache/`, `cache-vision/`, `cache-embd/` (intermediate files)
+- **Dataset**: `.cache/train2017/` (COCO train2017 images, ~13GB)
+
+All configurations use `"no_artifacts": true`, meaning only the final optimized models are retained.
+
+## Troubleshooting
+
+### Common Issues
+
+**Insufficient Storage**: The COCO train2017 dataset requires ~13GB of storage and is downloaded automatically to `.cache/train2017/`.
+
+**Memory Requirements**: The optimization process, particularly for the text model with its comprehensive pipeline, requires substantial memory.
+
+**QNN Provider**: Ensure the QNNExecutionProvider is properly installed and configured in your environment.
+
+**Platform Limitation**: The current setup script is designed for Linux only. Windows/macOS users will need to adapt the manual setup steps.
+
+**Dataset Download**: If the COCO dataset download fails, check your internet connection and available storage. The script uses `wget`, which must be available on your system.
diff --git a/google-gemma-Gemma3-4B/qnn/custom_gemma3_4b_datasets.py b/google-gemma-Gemma3-4B/qnn/custom_gemma3_4b_datasets.py
new file mode 100644
index 00000000..77751530
--- /dev/null
+++ b/google-gemma-Gemma3-4B/qnn/custom_gemma3_4b_datasets.py
@@ -0,0 +1,526 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+import copy
+import logging
+import os
+import subprocess
+import zipfile
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Optional
+
+import torch
+from datasets import load_dataset
+from huggingface_hub import hf_hub_download
+from PIL import Image as PILImage
+from transformers import (
+    AutoModel,
+    AutoProcessor,
+    AutoTokenizer,
+)
+
+from olive.data.registry import Registry
+
+logger = logging.getLogger(__name__)
+
+
+class BaseGemmaDataset(ABC):
+    """Abstract base class for Gemma dataset implementations."""
+
+    CACHE_DIR = os.getenv("CACHE_DIR", ".cache")
+
+    def __init__(self, model_id: str, first_n: Optional[int] = None):
+        self.model_id = model_id
+        self.first_n = first_n
+        self.processor = AutoProcessor.from_pretrained(self.model_id)
+
+        # Initialize attributes that will be set during dataset loading
+        self.image_data_path = None
+        self.raw_datasets = None
+
+        # Initialize processor components based on subclass requirements
+        self._initialize_processor_components()
+
+        self.setup_dataset()
+
+    @abstractmethod
+    def _initialize_processor_components(self):
+        """Initialize processor components specific to the dataset type."""
+
+    @abstractmethod
+    def _process_dataset_entry(self, entry: dict[str, any]):
+        """Process a single dataset entry according to the dataset type."""
+
+    def _convert_single_llava_to_gemma_conversation(
+        self, conversation: dict[str, str], strip_images: bool = False
+    ) -> dict[str, str | list[dict]]:
+        """Convert a single LLaVA-style conversation entry to Gemma-style.
+
+        Args:
+            conversation: The conversation entry to convert
+            strip_images: If True, remove <image> tokens and create text-only content.
+                If False, preserve <image> tokens and create multimodal content.
+
+        Examples:
+            >>> conversation = {"from": "human", "value": "<image>What are the colors of the bus in the image?"}
+            >>> _convert_single_llava_to_gemma_conversation(conversation, strip_images=False)
+            {
+                'role': 'user',
+                'content': [{'type': 'image'}, {'type': 'text', 'text': 'What are the colors of the bus in the image?'}]
+            }
+            >>> _convert_single_llava_to_gemma_conversation(conversation, strip_images=True)
+            {
+                'role': 'user',
+                'content': [{'type': 'text', 'text': 'What are the colors of the bus in the image?'}]
+            }
+
+        """
+        who = conversation.get("from")
+        match who:
+            case "human":
+                role = "user"
+            case "gpt":
+                role = "assistant"
+            case _:
+                raise ValueError(f"Unknown role: {who}")
+
+        text = conversation.get("value")
+
+        if strip_images:
+            # Text-only: remove <image> references completely
+            text = text.replace("<image>", "").strip()
+            return {
+                "role": role,
+                "content": [{"type": "text", "text": text}],
+            }
+        else:
+            # Multimodal: preserve <image> references
+            if "<image>" in text:
+                has_image = True
+                text = text.replace("<image>", "")
+            else:
+                has_image = False
+
+            return {
+                "role": role,
+                "content": (
+                    [{"type": "image"}, {"type": "text", "text": text}]
+                    if has_image
+                    else [{"type": "text", "text": text}]
+                ),
+            }
+
+    def _convert_llava_to_gemma_conversation(self, entry: dict[str, any], strip_images: bool = False):
+        """Convert LLaVA-style conversations to Gemma-style."""
+        entry["text"] = [
+            self._convert_single_llava_to_gemma_conversation(conversation, strip_images=strip_images)
+            for conversation in entry["conversations"]
+        ]
+        del entry["conversations"]
+        return entry
+
+    def _download_and_extract_images(self):
+        """Download the COCO train2017 image dataset and extract to the cache directory."""
+        zip_filename = "train2017.zip"
+        zip_path = os.path.join(self.CACHE_DIR, zip_filename)
+        extract_path = os.path.join(self.CACHE_DIR, "train2017")
+
+        # Create cache directory if it doesn't exist
+        os.makedirs(self.CACHE_DIR, exist_ok=True)
+
+        # Check if images are already downloaded and extracted
+        extract_path_obj = Path(extract_path)
+        if extract_path_obj.exists() and any(extract_path_obj.iterdir()):
+            logger.info("Images already exist at %s", extract_path)
+            return extract_path
+
+        # Download the dataset if zip doesn't exist
+        if not os.path.exists(zip_path):
+            logger.info("Downloading COCO train2017 dataset to %s", zip_path)
+            try:
+                subprocess.run(
+                    [
+                        "wget",
+                        "https://images.cocodataset.org/zips/train2017.zip",
+                        "--no-check-certificate",
+                        "-O",
+                        zip_path,
+                    ],
+                    check=True,
+                )
+                logger.info("Download completed successfully")
+            except subprocess.CalledProcessError:
+                logger.exception("Failed to download dataset")
+                raise
+            except FileNotFoundError:
+                logger.exception("wget command not found. 
Please install wget or use an alternative download method.") + raise + + # Extract the zip file + logger.info("Extracting %s to %s", zip_path, self.CACHE_DIR) + try: + with zipfile.ZipFile(zip_path, "r") as zip_ref: + zip_ref.extractall(self.CACHE_DIR) + logger.info("Extraction completed successfully") + except zipfile.BadZipFile: + logger.exception("Failed to extract zip file") + # Remove corrupted zip file so it can be re-downloaded + if os.path.exists(zip_path): + os.remove(zip_path) + raise + + return extract_path + + def _load_base_dataset(self): + """Load the base LlaVA dataset.""" + # Issue with Arrow leads to errors when using load_dataset directly on liuhaotian/LLaVA-Instruct-150K + file_path = hf_hub_download( + repo_id="liuhaotian/LLaVA-Instruct-150K", + filename="llava_instruct_80k.json", + repo_type="dataset", + cache_dir=self.CACHE_DIR, + ) + + self.image_data_path = self._download_and_extract_images() + self.raw_datasets = load_dataset("json", data_files=[file_path], split="train") + + # Limit data processing to the first_n rows + self.raw_datasets = self.raw_datasets if self.first_n is None else self.raw_datasets.select(range(self.first_n)) + + def _extract_image_details(self, entry: dict[str, any]): + """Extract image details from the dataset example. + + Opens the image file and adds image mode information to the example. + """ + image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"])) + entry["image_mode"] = image.mode + return entry + + def setup_dataset(self): + """Set up the dataset with common preprocessing steps.""" + self._load_base_dataset() + + # Extract image details + self.raw_datasets = self.raw_datasets.map(self._extract_image_details) + + # Filter out any images that are not RGB + self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") + + # Apply dataset-specific processing + self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) + + def get_dataset(self): + """Return the processed dataset.""" + return self.raw_datasets + + +class GemmaMultimodalDataset(BaseGemmaDataset): + """Dataset for full E2E Gemma 3 multi-modal model including both image and text.""" + + def _initialize_processor_components(self): + """Initialize tokenizer for multimodal processing.""" + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True + ) + + def setup_dataset(self): + """Set up the multimodal dataset with text conversation conversion.""" + self._load_base_dataset() + + # Convert the Llava-style conversation to Gemma-style conversation (preserve images) + self.raw_datasets = self.raw_datasets.map( + lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=False) + ) + + # Extract image details + self.raw_datasets = self.raw_datasets.map(self._extract_image_details) + + # Filter out any images that are not RGB + self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") + + # Apply multimodal processing + self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) + + def _process_dataset_entry(self, entry: dict[str, any]): + """Load image and tokenize the conversation for model input. 
+ + Args: + entry: Dataset entry containing text conversation and image path + + Returns: + Tokenized inputs ready for model processing + + """ + return self.processor.apply_chat_template( + entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True + ) + + +class GemmaTextOnlyDataset(BaseGemmaDataset): + """Dataset for only the text portion of the Gemma 3 model.""" + + def _initialize_processor_components(self): + """Initialize tokenizer for text-only processing.""" + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True + ) + + def setup_dataset(self): + """Set up the text-only dataset with conversation conversion.""" + self._load_base_dataset() + + # Convert the Llava-style conversation to Gemma-style conversation (strip images) + self.raw_datasets = self.raw_datasets.map( + lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=True) + ) + + # Extract image details (still needed for filtering) + self.raw_datasets = self.raw_datasets.map(self._extract_image_details) + + # Filter out any images that are not RGB + self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") + + # Apply text-only processing + self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) + + def _process_dataset_entry(self, entry: dict[str, any]): + """Extract and tokenize only the text content. + + Args: + entry: Dataset entry containing text conversation + + Returns: + Tokenized text inputs ready for model processing + + """ + # Apply chat template without images, text-only + inputs = self.tokenizer.apply_chat_template( + entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True + ) + return {k: v.squeeze(0) for k, v in inputs.items()} # Remove batch dimension + + +class GemmaImageDataset(BaseGemmaDataset): + """Dataset for only the image processing of the Gemma 3 model.""" + + def _initialize_processor_components(self): + """No additional components needed for image-only processing.""" + + def _process_dataset_entry(self, entry: dict[str, any]): + """Load image and extract only pixel_values for image-only processing.""" + # Load and process the image + image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"][0])) + + # Process image to get pixel_values + inputs = self.processor(text="", images=image, return_tensors="pt") + + # Return only pixel_values + return {"pixel_values": inputs["pixel_values"]} + + +class GemmaEmbeddingInputDataset(BaseGemmaDataset): + """Dataset that is the input to the embedding layer.""" + + def __init__(self, model_id, first_n=None): + # Initialize lazy-loaded model components + self._vision_tower = None + self._multi_modal_projector = None + + super().__init__(model_id, first_n) + + def _initialize_processor_components(self): + """Initialize only standard processor components.""" + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True + ) + + def _get_vision_components(self): + """Lazy-load vision model components when first needed.""" + if self._vision_tower is None: + logger.info("Loading vision model components for cached embedding dataset") + full_model = AutoModel.from_pretrained(self.model_id) + + # Extract vision components (equivalent to Gemma3VisualEmbeddingGenerator) + self._vision_tower = full_model.vision_tower + self._multi_modal_projector = 
full_model.multi_modal_projector + + # Clean up full model to save memory + del full_model.language_model + + return self._vision_tower.cuda(), self._multi_modal_projector.cuda() + + def setup_dataset(self): + """Set up the multimodal dataset with text conversation conversion.""" + self._load_base_dataset() + + # Convert the Llava-style conversation to Gemma-style conversation (preserve images) + self.raw_datasets = self.raw_datasets.map( + lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=False) + ) + + # Extract image details + self.raw_datasets = self.raw_datasets.map(self._extract_image_details) + + # Filter out any images that are not RGB + self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") + + # Apply multimodal processing + self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) + + def _process_dataset_entry(self, entry: dict[str, any]): + """Process entry to return input_ids and cached image features.""" + # Convert conversation and tokenize + inputs = self.processor.apply_chat_template( + entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True + ) + + # Load and process image + image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"][0])) + pixel_values = torch.tensor(self.processor(text="", images=image).pixel_values) + + # Get vision components and extract features + vision_tower, projector = self._get_vision_components() + pixel_values = pixel_values.to(device="cuda") + + with torch.no_grad(): + # Process through vision tower + image_outputs = vision_tower(pixel_values, output_hidden_states=True) + selected_image_feature = image_outputs.last_hidden_state + # Project to final embedding space + image_features = projector(selected_image_feature) + # Convert to numpy for caching + image_features = image_features.cpu().detach().numpy() + + return {"input_ids": inputs["input_ids"], "image_features": image_features} + + +class GemmaEmbeddingDataset(BaseGemmaDataset): + """Dataset that pre-merges text and image embeddings.""" + + def __init__(self, model_id, first_n=None): + # Initialize lazy-loaded model components + self._vision_tower = None + self._multi_modal_projector = None + self._embedding_layer = None + + super().__init__(model_id, first_n) + + def _initialize_processor_components(self): + """Initialize only standard processor components.""" + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True + ) + + def _get_model_components(self): + """Lazy-load all required model components when first needed.""" + if self._embedding_layer is None: + logger.info("Loading model components for merged embedding dataset") + full_model = AutoModel.from_pretrained(self.model_id) + + # Extract components + self._vision_tower = full_model.vision_tower.cuda() + self._multi_modal_projector = full_model.multi_modal_projector.cuda() + self._embedding_layer = copy.deepcopy(full_model.language_model.embed_tokens).cuda() + + # Clean up full model + del full_model.language_model + + return self._vision_tower, self._multi_modal_projector, self._embedding_layer + + def _merge_embeddings(self, input_ids: torch.Tensor, pixel_values: torch.Tensor): + """Merge text and image embeddings at special token positions.""" + vision_tower, projector, embedding_layer = self._get_model_components() + + # Get text embeddings + inputs_embeds = embedding_layer(input_ids.to(device="cuda")) + + # Process 
image + pixel_values = pixel_values.to(dtype=inputs_embeds.dtype, device="cuda") + with torch.no_grad(): + image_outputs = vision_tower(pixel_values, output_hidden_states=True) + selected_image_feature = image_outputs.last_hidden_state + image_features = projector(selected_image_feature) + + # Merge at special token positions (image_token_index = 262144) + image_token_index = 262144 + special_image_mask = (input_ids == image_token_index).unsqueeze(-1) + special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) + + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + return inputs_embeds.masked_scatter(special_image_mask, image_features) + + def setup_dataset(self): + """Set up the multimodal dataset with text conversation conversion.""" + self._load_base_dataset() + + # Convert the Llava-style conversation to Gemma-style conversation (preserve images) + self.raw_datasets = self.raw_datasets.map( + lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=False) + ) + + # Extract image details + self.raw_datasets = self.raw_datasets.map(self._extract_image_details) + + # Filter out any images that are not RGB + self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") + + # Apply multimodal processing + self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) + + def _process_dataset_entry(self, entry: dict[str, any]): + """Process entry to return merged embeddings.""" + # Convert conversation and tokenize + inputs = self.processor.apply_chat_template( + entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True + ) + + # Load and process image + image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"][0])) + pixel_values = torch.tensor(self.processor(text="", images=image).pixel_values) + + # Merge embeddings + inputs_embeds = self._merge_embeddings(inputs["input_ids"], pixel_values) + + return { + "input_ids": inputs["input_ids"], + "inputs_embeds": inputs_embeds, + "attention_mask": inputs["attention_mask"].squeeze(0), + } + + +# Remove this when submitting for review +TEXT_SHORTCUT_FIRST_N = 600 +SHORTCUT_FIRST_N = 200 + + +@Registry.register_dataset() +def gemma_dataset(model_id: str): + """Full E2E Gemma 3 multi-modal dataset (image + text).""" + return GemmaMultimodalDataset(model_id, first_n=TEXT_SHORTCUT_FIRST_N).get_dataset() + + +@Registry.register_dataset() +def gemma_text_dataset(model_id: str): + """Text-only Gemma 3 dataset.""" + return GemmaTextOnlyDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() + + +@Registry.register_dataset() +def gemma_image_dataset(model_id: str): + """Image-only Gemma 3 dataset.""" + return GemmaImageDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() + + +@Registry.register_dataset() +def gemma_embedding_input_dataset(model_id: str): + """Gemma 3 dataset with embedding layer input.""" + return GemmaEmbeddingInputDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() + + +@Registry.register_dataset() +def gemma_embedding_dataset(model_id: str): + """Gemma 3 dataset with pre-merged text and image embeddings.""" + return GemmaEmbeddingDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() diff --git a/google-gemma-Gemma3-4B/qnn/custom_gemma3_4b_embedding.py b/google-gemma-Gemma3-4B/qnn/custom_gemma3_4b_embedding.py new file mode 100644 index 00000000..97c9cf2e --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn/custom_gemma3_4b_embedding.py @@ -0,0 +1,37 @@ 
+# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + + +import logging + +import torch +from transformers import AutoModel + +logger = logging.getLogger(__name__) + + +class EmbeddingLayer(torch.nn.Module): + def __init__(self, full_model): + super().__init__() + self.embedding_layer = full_model.language_model.embed_tokens + + def forward(self, input_ids, image_features): + image_token_index = 262144 + inputs_embeds = self.embedding_layer(input_ids) + + special_image_mask = (input_ids == image_token_index).unsqueeze(-1) + special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + return inputs_embeds.masked_scatter(special_image_mask, image_features) + + +def load_gemma3_embedding_model(model_path): + full_model = AutoModel.from_pretrained("google/gemma-3-4b-it") + logger.info("Loaded full model: %s", full_model) + + embedding_layer = EmbeddingLayer(full_model) + + logger.info("Created embedding-only model: %s", embedding_layer) + return embedding_layer diff --git a/google-gemma-Gemma3-4B/qnn/custom_gemma3_4b_vision.py b/google-gemma-Gemma3-4B/qnn/custom_gemma3_4b_vision.py new file mode 100644 index 00000000..1eb7f8f3 --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn/custom_gemma3_4b_vision.py @@ -0,0 +1,36 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + + +import logging + +import torch +from transformers import AutoModel + +logger = logging.getLogger(__name__) + + +class Gemma3VisualEmbeddingGenerator(torch.nn.Module): + def __init__(self, full_model): + super().__init__() + # Extract only the vision components + self.vision_tower = full_model.vision_tower + self.multi_modal_projector = full_model.multi_modal_projector + + def forward(self, pixel_values): + # Process images through vision tower + image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) + selected_image_feature = image_outputs.last_hidden_state + # Project to final embedding space + return self.multi_modal_projector(selected_image_feature) + + +def load_gemma3_vision_model(model_path): + full_model = AutoModel.from_pretrained("google/gemma-3-4b-it") + logger.info("Loaded full model: %s", full_model) + + vision_model = Gemma3VisualEmbeddingGenerator(full_model) + logger.info("Created vision-only model: %s", vision_model) + return vision_model diff --git a/google-gemma-Gemma3-4B/qnn/env_setup.sh b/google-gemma-Gemma3-4B/qnn/env_setup.sh new file mode 100644 index 00000000..aa117afc --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn/env_setup.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +# Installing setuptools to build Olive from source +uv pip install setuptools + +# Requires installation of uv +uv pip install -r ../requirements.txt + +# Require installation of Olive dependencies +uv pip install -r ../../../requirements.txt + +# Disable CUDA extension build +export BUILD_CUDA_EXT=0 + +# Install AutoGPTQ from source +uv pip install --no-build-isolation git+https://github.com/PanQiWei/AutoGPTQ.git + +# Install GptqModel from source +# Note: Commit hash corresponds to commit which fixes Gemma 3 memory leak issue. See README.md for additional details. +uv pip install --no-build-isolation git+https://github.com/ModelCloud/GPTQModel.git@558449bed3ef2653c36041650d30da6bbbca440d + +# Install onnxruntime-qnn without installing onnxruntime +uv pip install -r https://raw.githubusercontent.com/microsoft/onnxruntime/refs/heads/main/requirements.txt +uv pip install -U --pre --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple onnxruntime-qnn --no-deps diff --git a/google-gemma-Gemma3-4B/qnn/gemma-3-4b.ipynb b/google-gemma-Gemma3-4B/qnn/gemma-3-4b.ipynb new file mode 100644 index 00000000..16724f77 --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn/gemma-3-4b.ipynb @@ -0,0 +1,399 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Gemma 3 4B QNN model conversion with Olive \n", + "### Task: Text + Vision Generation 📝\n", + "\n", + "In this notebook, you'll:\n", + "- Download the required datasets\n", + "- Convert LLM to QNN format\n", + "- Convert Vision to QNN format\n", + "- Convert Embedding layer with image to QNN format\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Platform requirements\n", + "This notebook is intended to run on a machine with:\n", + " * **Operating System**: Linux Ubuntu 22.04 (automated setup script is Linux-only)\n", + " * **Python**: 3.10\n", + " * NVIDIA driver version equivalent to 525.60.13\n", + " * NVIDIA A100 GPU\n", + " * **Storage**: ~13GB for COCO train2017 dataset (downloaded automatically)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 🐍 Python Virtual environments\n", + "Creates Olive and QNN python virtual environments" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!git clone https://github.com/CodeLinaro/Olive.git -b dev/qti-kromero/gemma3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import venv\n", + "from pathlib import Path\n", + "import subprocess\n", + "import json\n", + "import shutil\n", + "import urllib.request\n", + "import onnx\n", + "from onnx import helper, TensorProto\n", + "import glob\n", + "\n", + "current_dir = os.getcwd()\n", + "MODEL=\"google/gemma-3-4b-it\"\n", + "OLIVE_PYTHON_PATH = './olive_venv'\n", + "OLIVE_PYTHON_BIN = './olive_venv/bin/python'\n", + "olive_pip_path = Path(OLIVE_PYTHON_PATH) / \"bin\" / \"pip\"\n", + "OLIVE_REPO_PATH = Path(\"./Olive\")\n", + "OLIVE_REQ = \"./requirements.txt\"\n", + "QNN_REQ = \"./qnn_req.txt\"\n", + "\n", + "QNN_PYTHON_PATH = './qnn_venv'\n", + "QNN_PYTHON_BIN_PATH = './qnn_venv/bin'\n", + "qnn_pip_path = Path(QNN_PYTHON_PATH) / \"bin\" / \"pip\"\n", + "QNN_PYTHON_BIN_FULL_PATH = f\"{current_dir}/{QNN_PYTHON_BIN_PATH}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 
Prepare Olive Python Environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "if not os.path.exists(OLIVE_PYTHON_PATH):\n", + " print(\"Creating Olive Venv\")\n", + " builder = venv.EnvBuilder(with_pip=True)\n", + " builder.create(Path(OLIVE_PYTHON_PATH))\n", + "my_env = os.environ.copy()\n", + "my_env[\"BUILD_CUDA_EXT\"] = \"0\"\n", + "GPTQ=\"git+https://github.com/ModelCloud/GPTQModel.git@558449bed3ef2653c36041650d30da6bbbca440d\"\n", + "subprocess.check_call([str(olive_pip_path), \"install\", \"-U\", \"-r\" , OLIVE_REQ], env=my_env)\n", + "subprocess.check_call([str(olive_pip_path), \"install\", \"--no-build-isolation\", GPTQ], env=my_env)\n", + "subprocess.check_call([str(olive_pip_path), \"install\", \"-e\", OLIVE_REPO_PATH])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare QNN Python Environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "if not os.path.exists(QNN_PYTHON_PATH):\n", + " print(\"Creating QNN Venv\")\n", + " builder = venv.EnvBuilder(with_pip=True)\n", + " builder.create(Path(QNN_PYTHON_PATH))\n", + "subprocess.check_call([str(qnn_pip_path), \"install\", \"--no-build-isolation\", \"-r\" , QNN_REQ], env=my_env)\n", + "subprocess.check_call([str(qnn_pip_path), \"install\", \"-e\", OLIVE_REPO_PATH])\n", + "subprocess.check_call([str(qnn_pip_path), \"install\", \"-U\", \"--pre\", \"--extra-index-url\",\n", + " \"https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple\",\n", + " \"onnxruntime-qnn==1.23.0.dev20250815002\", \"--no-deps\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 🤗 Login to Hugging Face\n", + "To access models, you'll need to log-in to Hugging Face with a [user access token](https://huggingface.co/docs/hub/security-tokens). 
The following command will run you through the steps to login:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!huggingface-cli login --token <>" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Apply few patches to Onnxruntime and GPTQModel\n", + "\n", + "This is needed for running the Olive recipies for this model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!patch ./olive_venv/lib/python3.10/site-packages/gptqmodel/utils/model.py < gptqmodel_int8.patch\n", + "!patch ./olive_venv/lib/python3.10/site-packages/onnxruntime_genai/models/builder.py < oga_patch1.patch\n", + "!patch ./olive_venv/lib/python3.10/site-packages/onnxruntime_genai/models/quantized_model.py < oga_patch2.patch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "base_url = \"https://raw.githubusercontent.com/CodeLinaro/onnxruntime/326d9d30129bbad698e0306d24dcea0ec5a19e60\"\n", + "urls = [\n", + " base_url + \"/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py\",\n", + " base_url + \"/onnxruntime/python/tools/quantization/quant_utils.py\"\n", + "]\n", + "\n", + "destinations = [\n", + " OLIVE_PYTHON_PATH+\"/lib/python3.10/site-packages/onnxruntime/quantization/execution_providers/qnn/quant_config.py\",\n", + " OLIVE_PYTHON_PATH+\"/lib/python3.10/site-packages/onnxruntime/quantization/quant_utils.py\"\n", + "]\n", + "\n", + "for url, dest in zip(urls, destinations):\n", + " urllib.request.urlretrieve(url, dest)\n", + " print(f\"Downloaded and replaced: {dest}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Olive Recipes" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**GPU utilization observed during the run**\n", + "\n", + "\t\ta. Text GPTQModel quantization: 12gb\n", + "\t\tb. Text Onnx static quantization: 41gb\n", + "\t\tc. Vision Onnx static quantization: 68gb\n", + " d. 
Embedding Onnx static quantization: 3gb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Clean Context binary directories if they exist\n", + "def clean_directory(path):\n", + " if os.path.exists(path):\n", + " for file in glob.glob(os.path.join(path, '*')):\n", + " if os.path.isfile(file):\n", + " os.remove(file)\n", + "dirs_to_clean = [\n", + " './models/gemma3_qnn/model/',\n", + " './models/gemma-3-4b-it-vision/model/',\n", + " './models/gemma-3-4b-it-embed/model/'\n", + "]\n", + "\n", + "for dir_path in dirs_to_clean:\n", + " clean_directory(dir_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1️⃣ LLM model generation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config_path = Path(f\"./gemma3-4b-text-qnn-config.json\")\n", + "with open(config_path, \"r\") as file:\n", + " data = json.load(file)\n", + "\n", + "data[\"systems\"][\"qnn_system\"][\"python_environment_path\"] = QNN_PYTHON_BIN_FULL_PATH\n", + "data[\"input_model\"][\"model_path\"] = MODEL\n", + "\n", + "with open(config_path, \"w\") as file:\n", + " json.dump(data, file, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!./olive_venv/bin/olive run --config ./gemma3-4b-text-qnn-config.json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2️⃣ Vision model Quantization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config_path = Path(f\"./gemma3-4b-vision-qnn-config.json\")\n", + "with open(config_path, \"r\") as file:\n", + " data = json.load(file)\n", + "data[\"systems\"][\"qnn_system\"][\"python_environment_path\"] = QNN_PYTHON_BIN_FULL_PATH\n", + "\n", + "with open(config_path, \"w\") as file:\n", + " json.dump(data, file, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!./olive_venv/bin/olive run --config ./gemma3-4b-vision-qnn-config.json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3️⃣ Embedding Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!./olive_venv/bin/olive run --config ./gemma3-4b-embedding-qnn-config.json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Keep output of the embedding model as uint16 instead of float" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = onnx.load(\"./models/gemma-3-4b-it-embed/model/model.onnx\")\n", + "graph = model.graph\n", + "\n", + "last_node = graph.node[-1]\n", + "graph.node.remove(last_node)\n", + "previous_node_output = graph.node[-1].output[0]\n", + "new_output = helper.make_tensor_value_info(\n", + " name=previous_node_output,\n", + " elem_type=TensorProto.UINT16,\n", + " shape=[\"batch_size\", \"seq_length\", 2560]\n", + ")\n", + "graph.output.remove(graph.output[0])\n", + "graph.output.extend([new_output])\n", + "onnx.save(model, \"./models/gemma-3-4b-it-embed/model/embeddings_with_image.onnx\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare final ORT GenAI folder for on-device inference " + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!cp ./models/gemma-3-4b-it-embed/model/embeddings_with_image.onnx ./models/gemma3_qnn/model/\n", + "!cp ./models/gemma-3-4b-it-vision/model/model_ctx.onnx ./models/gemma3_qnn/model/model_ctx_vision.onnx \n", + "!cp ./models/gemma-3-4b-it-vision/model/model_ctx_qnn.bin ./models/gemma3_qnn/model/model_ctx_qnn.bin \n", + "!cp ./genai/*.* ./models/gemma3_qnn/model/\n", + "!ls -al ./models/gemma3_qnn/model/\n", + "\n", + "print(\"ORT GenAI inference setup: ./models/gemma3_qnn\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/google-gemma-Gemma3-4B/qnn/gemma3-4b-embedding-qnn-config.json b/google-gemma-Gemma3-4B/qnn/gemma3-4b-embedding-qnn-config.json new file mode 100644 index 00000000..06178224 --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn/gemma3-4b-embedding-qnn-config.json @@ -0,0 +1,51 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_script": "custom_gemma3_4b_embedding.py", + "model_loader": "load_gemma3_embedding_model", + "io_config": { + "input_names": [ "input_ids", "image_features" ], + "input_shapes": [ [ 1, 64 ], [ 1, 256, 2560 ] ], + "input_types": [ "int64", "float32" ], + "output_names": [ "/model/embed_tokens/Mul_output_cast_0" ], + "output_shapes": [ [ 1, 64, 2560 ] ], + "dynamic_axes": { + "input_ids": { "0": "batch_size", "1": "seq_length" }, + "image_features": { "0": "batch_size", "1": "image_tokens_length" } + } + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ { "device": "cpu", "execution_providers": [ "CPUExecutionProvider" ] } ] + } + }, + "data_configs": [ + { + "name": "gemma_embedding_data_config", + "user_script": "custom_gemma3_4b_datasets.py", + "load_dataset_config": { "type": "gemma_embedding_input_dataset", "model_id": "google/gemma-3-4b-it" } + } + ], + "passes": { + "conversion": { "type": "OnnxConversion", "target_opset": 20 }, + "quantization": { + "type": "OnnxStaticQuantization", + "quant_preprocess": false, + "data_config": "gemma_embedding_data_config", + "activation_type": "uint16", + "precision": "uint8", + "calibrate_method": "MinMax", + "calibration_providers": [ "CUDAExecutionProvider" ], + "per_channel": true, + "weight_symmetric": true + }, + "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-embedding" } + }, + "target": "local_system", + "log_severity_level": 1, + "output_dir": "models/gemma-3-4b-it-embed", + "cache_dir": "cache-embd", + "no_artifacts": true +} diff --git a/google-gemma-Gemma3-4B/qnn/gemma3-4b-text-qnn-config.json b/google-gemma-Gemma3-4B/qnn/gemma3-4b-text-qnn-config.json new file mode 100644 index 00000000..ff1952f5 --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn/gemma3-4b-text-qnn-config.json @@ -0,0 +1,180 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "google/gemma-3-4b-it", + "custom_task_class_name": "Gemma3ForCausalLM", + "custom_task_class_module": "transformers" + }, + "systems": { + "qnn_system": { + "type": "PythonEnvironment", + 
"python_environment_path": "", + "accelerators": [ + { + "execution_providers": [ + "QNNExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "wikitext2_train_joined", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "wikitext", + "subset": "wikitext-2-raw-v1", + "split": "train" + }, + "pre_process_data_config": { + "strategy": "join", + "add_special_tokens": false, + "max_seq_len": 4096, + "max_samples": 128 + } + }, + { + "name": "wikitext2_train_act", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "wikitext", + "subset": "wikitext-2-raw-v1", + "split": "train" + }, + "pre_process_data_config": { + "strategy": "line-by-line", + "add_special_tokens": true, + "max_samples": 200, + "max_seq_len": 4096 + } + } + ], + "passes": { + "cs": { + "type": "CaptureSplitInfo", + "num_splits": 2, + "unique_embeds_lm_head_splits": true + }, + "g": { + "type": "GptqModel", + "bits": 4, + "sym": true, + "group_size": -1, + "lm_head": false, + "device": "cuda", + "data_config": "wikitext2_train_joined", + "dynamic": { + "+:.*v_proj*": { + "bits": 8, + "sym": true, + "group_size": -1, + "desc_act": true + }, + "+:.*k_proj*": { + "bits": 8, + "sym": true, + "group_size": -1, + "desc_act": true + }, + "+:.*q_proj*": { + "bits": 8, + "sym": true, + "group_size": -1, + "desc_act": true + } + } + }, + "mb": { + "type": "ModelBuilder", + "precision": "int4", + "int4_block_size": 16, + "int4_accuracy_level": 4, + "int4_op_types_to_quantize": ["Gather", "MatMul"] + }, + "mq": { + "type": "MatMulNBitsToQDQ", + "use_int4": true, + "add_zero_point": true, + "nodes_to_exclude": [ + "/lm_head/MatMul_Q4" + ], + "save_as_external_data": true + }, + "gs": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "RemoveRopeMultiCache" + }, + { + "surgeon": "AttentionMaskToSequenceLengths" + }, + { + "surgeon": "SimplifiedLayerNormToL2Norm" + } + ], + "save_as_external_data": true + }, + "f16": { + "type": "OnnxFloatToFloat16", + "op_include_list": [ + "GroupQueryAttention" + ], + "keep_io_types": [ + "logits" + ], + "save_as_external_data": true + }, + "sq": { + "type": "OnnxStaticQuantization", + "data_config": "wikitext2_train_act", + "activation_type": "uint16", + "precision": "uint8", + "calibration_providers": [ + "CUDAExecutionProvider" + ], + "quant_preprocess": true, + "op_types_to_exclude": [ + "GatherBlockQuantized", + "GroupQueryAttention", + "MatMulNBits" + ], + "save_as_external_data": true, + "extra_option": { + "CalibStridedMinMax": 4 + } + }, + "sp": { + "type": "SplitModel" + }, + "st": { + "type": "StaticLLM", + "batch_size": 1, + "context_length": 64 + }, + "cb": { + "type": "EPContextBinaryGenerator", + "provider_options": { + "htp_performance_mode": "burst", + "htp_graph_finalization_optimization_mode": "3", + "vtcm_mb": "8", + "htp_arch": "v73", + "soc_model": "60" + }, + "session_options": { + "intra_op_num_threads": 2, + "inter_op_num_threads": 1 + }, + "weight_sharing": true + }, + "cp": { + "type": "ComposeOnnxModels" + } + }, + "target": "qnn_system", + "log_severity_level": 0, + "output_dir": "models/gemma3_qnn", + "cache_dir": "cache", + "no_artifacts": true +} \ No newline at end of file diff --git a/google-gemma-Gemma3-4B/qnn/gemma3-4b-vision-qnn-config.json b/google-gemma-Gemma3-4B/qnn/gemma3-4b-vision-qnn-config.json new file mode 100644 index 00000000..e252381a --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn/gemma3-4b-vision-qnn-config.json @@ -0,0 +1,102 @@ +{ + "input_model": { + "type": "PyTorchModel", + 
"model_script": "custom_gemma3_4b_vision.py", + "model_loader": "load_gemma3_vision_model", + "io_config": { + "input_names": [ + "pixel_values" + ], + "input_shapes": [ + [ + 1, + 3, + 896, + 896 + ] + ], + "input_types": [ + "float32" + ], + "output_names": [ + "image_features" + ], + "output_shapes": [ + [ + 1, + 256, + 2560 + ] + ] + } + }, + "systems": { + "qnn_system": { + "type": "PythonEnvironment", + "python_environment_path": "", + "accelerators": [ + { + "execution_providers": [ + "QNNExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "gemma_vision_data_config", + "user_script": "custom_gemma3_4b_datasets.py", + "load_dataset_config": { + "type": "gemma_image_dataset", + "model_id": "google/gemma-3-4b-it" + } + } + ], + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 20 + }, + "surgery": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "MatMulAddToGemm" + } + ] + }, + "quantization": { + "type": "OnnxStaticQuantization", + "quant_preprocess": true, + "data_config": "gemma_vision_data_config", + "activation_type": "uint16", + "precision": "uint8", + "calibrate_method": "MinMax", + "calibration_providers": [ + "CUDAExecutionProvider" + ], + "per_channel": true, + "weight_symmetric": true + }, + "cb": { + "type": "EPContextBinaryGenerator", + "provider_options": { + "htp_performance_mode": "burst", + "htp_graph_finalization_optimization_mode": "3", + "vtcm_mb": "8", + "htp_arch": "v73", + "soc_model": "60" + } + }, + "add_metadata": { + "type": "AddOliveMetadata", + "graph_name": "gemma-3-4b-it-vision" + } + }, + "target": "qnn_system", + "log_severity_level": 1, + "output_dir": "models/gemma-3-4b-it-vision", + "cache_dir": "cache-vision", + "no_artifacts": true +} \ No newline at end of file diff --git a/google-gemma-Gemma3-4B/qnn/genai/app.py b/google-gemma-Gemma3-4B/qnn/genai/app.py new file mode 100644 index 00000000..0b5da39c --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn/genai/app.py @@ -0,0 +1,163 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# Licensed under the MIT License + +import argparse +import glob +import json +import logging +import os +import time +from pathlib import Path + +import onnxruntime_genai as og + +logger = logging.getLogger(__name__) + + +def _find_dir_contains_sub_dir(current_dir: Path, target_dir_name): + curr_path = Path(current_dir).absolute() + target_dir = glob.glob(target_dir_name, root_dir=curr_path) + if target_dir: + return Path(curr_path / target_dir[0]).absolute() + else: + if curr_path.parent == curr_path: + # Root dir + return None + return _find_dir_contains_sub_dir(curr_path / "..", target_dir_name) + + +def _complete(text, state): + return (glob.glob(text + "*") + [None])[state] + + +def run(args: argparse.Namespace): + logger.info("Loading model...") + config = og.Config(args.model_path) + if args.execution_provider != "follow_config": + config.clear_providers() + if args.execution_provider != "cpu": + logger.info(f"Setting model to {args.execution_provider}...") + config.append_provider(args.execution_provider) + model = og.Model(config) + logger.info("Model loaded") + + tokenizer = og.Tokenizer(model) + processor = model.create_multimodal_processor() + stream = processor.create_stream() + + interactive = not args.non_interactive + + while True: + if interactive: + try: + import readline + + readline.set_completer_delims(" \t\n;") + readline.parse_and_bind("tab: complete") + readline.set_completer(_complete) + except ImportError: + # Not available on some platforms. Ignore it. + pass + image_paths = [ + image_path.strip() + for image_path in input("Image Path (comma separated; leave empty if no image): ").split(",") + ] + else: + if args.image_paths: + image_paths = args.image_paths + else: + image_paths = [str(Path(__file__).parent / "images" / "dog.jpg")] + + image_paths = [image_path for image_path in image_paths if image_path] + + images = None + if len(image_paths) == 0: + logger.info("No image provided") + else: + for i, image_path in enumerate(image_paths): + if not os.path.exists(image_path): + raise FileNotFoundError(f"Image file not found: {image_path}") + logger.info(f"Using image: {image_path}") + + images = og.Images.open(*image_paths) + + if interactive: + text = input("Prompt: ") + else: + if args.prompt: + text = args.prompt + else: + text = "What is shown in this image?" 
+
+        # Construct the "messages" argument passed to apply_chat_template
+        messages = []
+        if model.type == "phi3v":
+            # Combine all image tags and text into one user message
+            content = "".join([f"<|image_{i + 1}|>\n" for i in range(len(image_paths))]) + text
+            messages.append({"role": "user", "content": content})
+        else:
+            # Gemma3-style multimodal: structured content
+            content_list = [{"type": "image"} for _ in image_paths]
+            content_list.append({"type": "text", "text": text})
+            messages.append({"role": "user", "content": content_list})
+
+        # Apply the chat template using the tokenizer
+        message_json = json.dumps(messages)
+        prompt = tokenizer.apply_chat_template(message_json, add_generation_prompt=True)
+
+        logger.info("Processing images and prompt...")
+        inputs = processor(prompt, images=images)
+
+        logger.info("Generating response...")
+        params = og.GeneratorParams(model)
+        params.set_search_options(max_length=1024)
+
+        generator = og.Generator(model, params)
+        generator.set_inputs(inputs)
+        start_time = time.time()
+
+        while not generator.is_done():
+            generator.generate_next_token()
+
+            new_token = generator.get_next_tokens()[0]
+            # Stream decoded tokens to stdout; logging calls do not accept end/flush kwargs
+            print(stream.decode(new_token), end="", flush=True)
+
+        total_run_time = time.time() - start_time
+        logger.info(f"Total Time : {total_run_time:.2f}")
+
+        # Delete the generator to free the captured graph before creating another one
+        del generator
+
+        if not interactive:
+            break
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-m", "--model_path", type=str, default="", required=True, help="Path to the folder containing the model"
+    )
+    parser.add_argument(
+        "-e",
+        "--execution_provider",
+        type=str,
+        required=False,
+        default="follow_config",
+        choices=["cpu", "cuda", "dml", "follow_config"],
+        help="Execution provider to run the ONNX Runtime session with. 
Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.", + ) + parser.add_argument( + "--image_paths", nargs="*", type=str, required=False, help="Path to the images, mainly for CI usage" + ) + parser.add_argument( + "-pr", "--prompt", required=False, help="Input prompts to generate tokens from, mainly for CI usage" + ) + parser.add_argument( + "--non-interactive", + action=argparse.BooleanOptionalAction, + default=True, + required=False, + help="Non-interactive mode, mainly for CI usage", + ) + args = parser.parse_args() + run(args) diff --git a/google-gemma-Gemma3-4B/qnn/genai/genai_config.json b/google-gemma-Gemma3-4B/qnn/genai/genai_config.json new file mode 100644 index 00000000..6916e407 --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn/genai/genai_config.json @@ -0,0 +1,429 @@ +{ + "model": { + "bos_token_id": 2, + "context_length": 131072, + "decoder": { + "session_options": { + "log_id": "onnxruntime-genai", + "provider_options": [ + ] + }, + "head_size": 256, + "hidden_size": 2560, + "inputs": { + "input_ids":"input_ids", + "inputs_embeds": "inputs_embeds", + "attention_mask": "attention_mask", + "past_key_names": "past_key_values.%d.key", + "past_value_names": "past_key_values.%d.value", + "past_sequence_length": "past_seq_len", + "total_sequence_length": "total_seq_len" + }, + "outputs": { + "logits": "logits", + "present_key_names": "present.%d.key", + "present_value_names": "present.%d.value" + }, + "num_attention_heads": 8, + "num_hidden_layers": 34, + "num_key_value_heads": 4, + "sliding_window": { + "window_size": 64, + "slide_key_value_cache": false, + "slide_inputs": true, + "pad_value": 0, + "alignment": "left" + }, + "pipeline": [ + { + "context_ctx": { + "filename": "context_ctx.onnx", + "inputs": [ + "/model/embed_tokens/Mul_output_cast_0_QuantizeLinear_Output", + "past_key_values.0.key", + "past_key_values.0.value", + "past_seq_len", + "total_seq_len", + "past_key_values.1.key", + "past_key_values.1.value", + "past_key_values.2.key", + "past_key_values.2.value", + "past_key_values.3.key", + "past_key_values.3.value", + "past_key_values.4.key", + "past_key_values.4.value", + "past_key_values.5.key", + "past_key_values.5.value", + "past_key_values.6.key", + "past_key_values.6.value", + "past_key_values.7.key", + "past_key_values.7.value", + "past_key_values.8.key", + "past_key_values.8.value", + "past_key_values.9.key", + "past_key_values.9.value", + "past_key_values.10.key", + "past_key_values.10.value", + "past_key_values.11.key", + "past_key_values.11.value", + "past_key_values.12.key", + "past_key_values.12.value", + "past_key_values.13.key", + "past_key_values.13.value", + "past_key_values.14.key", + "past_key_values.14.value", + "past_key_values.15.key", + "past_key_values.15.value", + "past_key_values.16.key", + "past_key_values.16.value", + "past_key_values.17.key", + "past_key_values.17.value", + "past_key_values.18.key", + "past_key_values.18.value", + "past_key_values.19.key", + "past_key_values.19.value", + "past_key_values.20.key", + "past_key_values.20.value", + "past_key_values.21.key", + "past_key_values.21.value", + "past_key_values.22.key", + "past_key_values.22.value", + "past_key_values.23.key", + "past_key_values.23.value", + "past_key_values.24.key", + "past_key_values.24.value", + "past_key_values.25.key", + "past_key_values.25.value", + "past_key_values.26.key", + "past_key_values.26.value", + "past_key_values.27.key", + "past_key_values.27.value", + "past_key_values.28.key", + 
"past_key_values.28.value", + "past_key_values.29.key", + "past_key_values.29.value", + "past_key_values.30.key", + "past_key_values.30.value", + "past_key_values.31.key", + "past_key_values.31.value", + "past_key_values.32.key", + "past_key_values.32.value", + "past_key_values.33.key", + "past_key_values.33.value" + ], + "outputs": [ + "present.0.key", + "present.0.value", + "present.1.key", + "present.1.value", + "present.2.key", + "present.2.value", + "present.3.key", + "present.3.value", + "present.4.key", + "present.4.value", + "present.5.key", + "present.5.value", + "present.6.key", + "present.6.value", + "present.7.key", + "present.7.value", + "present.8.key", + "present.8.value", + "present.9.key", + "present.9.value", + "present.10.key", + "present.10.value", + "present.11.key", + "present.11.value", + "present.12.key", + "present.12.value", + "present.13.key", + "present.13.value", + "present.14.key", + "present.14.value", + "present.15.key", + "present.15.value", + "present.16.key", + "present.16.value", + "present.17.key", + "present.17.value", + "present.18.key", + "present.18.value", + "present.19.key", + "present.19.value", + "present.20.key", + "present.20.value", + "present.21.key", + "present.21.value", + "present.22.key", + "present.22.value", + "present.23.key", + "present.23.value", + "present.24.key", + "present.24.value", + "present.25.key", + "present.25.value", + "present.26.key", + "present.26.value", + "present.27.key", + "present.27.value", + "present.28.key", + "present.28.value", + "present.29.key", + "present.29.value", + "present.30.key", + "present.30.value", + "present.31.key", + "present.31.value", + "present.32.key", + "present.32.value", + "present.33.key", + "present.33.value", + "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_cast_0_QuantizeLinear_Output" + ], + "session_options": { + "intra_op_num_threads": 2, + "inter_op_num_threads": 1, + "provider_options": [ + { + "qnn": { + "htp_performance_mode": "burst", + "htp_graph_finalization_optimization_mode": "3", + "soc_model": "60" + } + } + ] + }, + "run_on_token_gen": false + }, + "iterator_ctx": { + "filename": "iterator_ctx.onnx", + "inputs": [ + "/model/embed_tokens/Mul_output_cast_0_QuantizeLinear_Output", + "past_key_values.0.key", + "past_key_values.0.value", + "past_seq_len", + "total_seq_len", + "past_key_values.1.key", + "past_key_values.1.value", + "past_key_values.2.key", + "past_key_values.2.value", + "past_key_values.3.key", + "past_key_values.3.value", + "past_key_values.4.key", + "past_key_values.4.value", + "past_key_values.5.key", + "past_key_values.5.value", + "past_key_values.6.key", + "past_key_values.6.value", + "past_key_values.7.key", + "past_key_values.7.value", + "past_key_values.8.key", + "past_key_values.8.value", + "past_key_values.9.key", + "past_key_values.9.value", + "past_key_values.10.key", + "past_key_values.10.value", + "past_key_values.11.key", + "past_key_values.11.value", + "past_key_values.12.key", + "past_key_values.12.value", + "past_key_values.13.key", + "past_key_values.13.value", + "past_key_values.14.key", + "past_key_values.14.value", + "past_key_values.15.key", + "past_key_values.15.value", + "past_key_values.16.key", + "past_key_values.16.value", + "past_key_values.17.key", + "past_key_values.17.value", + "past_key_values.18.key", + "past_key_values.18.value", + "past_key_values.19.key", + "past_key_values.19.value", + "past_key_values.20.key", + "past_key_values.20.value", + "past_key_values.21.key", + "past_key_values.21.value", + 
"past_key_values.22.key", + "past_key_values.22.value", + "past_key_values.23.key", + "past_key_values.23.value", + "past_key_values.24.key", + "past_key_values.24.value", + "past_key_values.25.key", + "past_key_values.25.value", + "past_key_values.26.key", + "past_key_values.26.value", + "past_key_values.27.key", + "past_key_values.27.value", + "past_key_values.28.key", + "past_key_values.28.value", + "past_key_values.29.key", + "past_key_values.29.value", + "past_key_values.30.key", + "past_key_values.30.value", + "past_key_values.31.key", + "past_key_values.31.value", + "past_key_values.32.key", + "past_key_values.32.value", + "past_key_values.33.key", + "past_key_values.33.value" + ], + "outputs": [ + "present.0.key", + "present.0.value", + "present.1.key", + "present.1.value", + "present.2.key", + "present.2.value", + "present.3.key", + "present.3.value", + "present.4.key", + "present.4.value", + "present.5.key", + "present.5.value", + "present.6.key", + "present.6.value", + "present.7.key", + "present.7.value", + "present.8.key", + "present.8.value", + "present.9.key", + "present.9.value", + "present.10.key", + "present.10.value", + "present.11.key", + "present.11.value", + "present.12.key", + "present.12.value", + "present.13.key", + "present.13.value", + "present.14.key", + "present.14.value", + "present.15.key", + "present.15.value", + "present.16.key", + "present.16.value", + "present.17.key", + "present.17.value", + "present.18.key", + "present.18.value", + "present.19.key", + "present.19.value", + "present.20.key", + "present.20.value", + "present.21.key", + "present.21.value", + "present.22.key", + "present.22.value", + "present.23.key", + "present.23.value", + "present.24.key", + "present.24.value", + "present.25.key", + "present.25.value", + "present.26.key", + "present.26.value", + "present.27.key", + "present.27.value", + "present.28.key", + "present.28.value", + "present.29.key", + "present.29.value", + "present.30.key", + "present.30.value", + "present.31.key", + "present.31.value", + "present.32.key", + "present.32.value", + "present.33.key", + "present.33.value", + "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_cast_0_QuantizeLinear_Output" + ], + "session_options": { + "intra_op_num_threads": 2, + "inter_op_num_threads": 1, + "provider_options": [ + { + "qnn": { + "htp_performance_mode": "burst", + "htp_graph_finalization_optimization_mode": "3", + "soc_model": "60" + } + } + ] + }, + "run_on_prompt": false + }, + "lm_head": { + "filename": "lm_head.onnx", + "inputs": [ + "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_cast_0_QuantizeLinear_Output" + ], + "outputs": [ + "logits" + ] + } + } + ] + }, + "embedding": { + "filename": "embeddings_with_image.onnx", + "inputs": { + "input_ids": "input_ids", + "image_features": "image_features" + }, + "outputs": { + "inputs_embeds": "/model/embed_tokens/Mul_output_cast_0_QuantizeLinear_Output" + } + }, + "vision": { + "filename": "model_ctx_vision.onnx", + "inputs": { + "pixel_values": "pixel_values" + }, + "outputs": { + "image_features": "image_features" + }, + "session_options": { + "intra_op_num_threads": 2, + "inter_op_num_threads": 1, + "provider_options": [ + { + "qnn": { + "htp_performance_mode": "burst", + "htp_graph_finalization_optimization_mode": "3", + "soc_model": "60" + } + } + ] + } + }, + "eos_token_id": [ + 1, + 106 + ], + "pad_token_id": 0, + "type": "gemma3", + "vocab_size": 262208 + }, + "search": { + "diversity_penalty": 0.0, + "do_sample": true, + "early_stopping": true, 
+ "length_penalty": 1.0, + "max_length": 131072, + "min_length": 0, + "no_repeat_ngram_size": 0, + "num_beams": 1, + "num_return_sequences": 1, + "past_present_share_buffer": true, + "repetition_penalty": 1.0, + "temperature": 1.0, + "top_k": 64, + "top_p": 0.95 + } +} \ No newline at end of file diff --git a/google-gemma-Gemma3-4B/qnn/genai/processor_config.json b/google-gemma-Gemma3-4B/qnn/genai/processor_config.json new file mode 100644 index 00000000..b25059aa --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn/genai/processor_config.json @@ -0,0 +1,24 @@ +{ + "processor": { + "name": "gemma_3_image_processing", + "transforms": [ + { "operation": { "name": "decode_image", "type": "DecodeImage", "attrs": { "color_space": "RGB" } } }, + { + "operation": { + "name": "resize", + "type": "Resize", + "attrs": { "interpolation": "CUBIC", "width": 896, "height": 896, "keep_aspect_ratio": 0 } + } + }, + { "operation": { "name": "re-scale", "type": "Rescale" } }, + { + "operation": { + "name": "normalize", + "type": "Normalize", + "attrs": { "mean": [ 0.5, 0.5, 0.5 ], "std": [ 0.5, 0.5, 0.5 ] } + } + }, + { "operation": { "name": "to_channel_first", "type": "Permute3D", "attrs": { "dims": [ 2, 0, 1 ] } } } + ] + } +} diff --git a/google-gemma-Gemma3-4B/qnn/gptqmodel_int8.patch b/google-gemma-Gemma3-4B/qnn/gptqmodel_int8.patch new file mode 100644 index 00000000..7303604d --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn/gptqmodel_int8.patch @@ -0,0 +1,40 @@ +--- "model (1).py" 2025-11-21 20:34:23.711972000 -0800 ++++ model.py 2025-11-21 20:34:23.750922000 -0800 +@@ -509,6 +509,11 @@ + return convert_gptq_v2_to_v1_format(model, quantize_config, qlinear_kernel), True + else: + return model, False ++def get_dynamic_bits(name, global_bits, overrides): ++ for pattern, config in overrides.items(): ++ if re.match(pattern.removeprefix("+:"), name): ++ return config.get("bits", global_bits) ++ return global_bits + + def convert_gptq_v2_to_v1_format_module( + module: BaseQuantLinear, +@@ -517,10 +522,10 @@ + assert isinstance(module, BaseQuantLinear) + + log.info.once("Format: Converting GPTQ v2 to v1") +- +- if quantize_config.bits == 2: ++ bits = quantize_config.bits if quantize_config.dynamic is None else get_dynamic_bits(module.name, quantize_config.bits, quantize_config.dynamic) ++ if bits == 2: + module.qzeros.data -= 0b01010101010101010101010101010101 +- elif quantize_config.bits == 3: ++ elif bits == 3: + module.qzeros.data[:, range(0, module.qzeros.data.shape[1], 3)] -= ( + 0b00100100100100100100100100100100 + ) +@@ -530,9 +535,9 @@ + module.qzeros.data[:, range(2, module.qzeros.data.shape[1], 3)] -= ( + 0b01001001001001001001001001001001 + ) +- elif quantize_config.bits == 4: ++ elif bits == 4: + module.qzeros.data -= 0b00010001000100010001000100010001 +- elif quantize_config.bits == 8: ++ elif bits == 8: + module.qzeros.data -= 0b00000001000000010000000100000001 + else: + raise NotImplementedError("Only 2,3,4,8 bits are supported.") diff --git a/google-gemma-Gemma3-4B/qnn/oga_patch1.patch b/google-gemma-Gemma3-4B/qnn/oga_patch1.patch new file mode 100644 index 00000000..e7952b8e --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn/oga_patch1.patch @@ -0,0 +1,27 @@ +--- buildeer_orig.py 2025-11-21 22:56:02.640117000 -0800 ++++ builder.py 2025-11-21 22:58:33.933287000 -0800 +@@ -1898,8 +1898,14 @@ + # Unpack attention weights if needed + self.make_attention_unpacked(layer_id, attention, root_input, **kwargs) + ++ # Get dtype used for MatMul ops ++ q_dtype = getattr(attention.q_proj, "weight", 
getattr(attention.q_proj, "bits", None)) ++ k_dtype = getattr(attention.k_proj, "weight", getattr(attention.k_proj, "bits", None)) ++ v_dtype = getattr(attention.v_proj, "weight", getattr(attention.v_proj, "bits", None)) ++ qkv_dtype_equal = getattr(q_dtype, "dtype", q_dtype) == getattr(k_dtype, "dtype", k_dtype) == getattr(v_dtype, "dtype", v_dtype) ++ + # Make MatMul nodes +- if self.attention_attrs["use_packed_matmul"]: ++ if self.attention_attrs["use_packed_matmul"] and qkv_dtype_equal: + # Combine 3 MatMuls into 1 packed MatMul + qkv_matmul_basename = f"/model/layers.{layer_id}/attn/qkv_proj/MatMul" + qkv_matmul_name = self.make_packed_matmul(attention.q_proj, attention.k_proj, attention.v_proj, qkv_matmul_basename, root_input) +@@ -1921,7 +1927,7 @@ + v_bias_exists = attention.v_proj.bias is not None and torch.count_nonzero(attention.v_proj.bias) > 0 + all_bias_exists = q_bias_exists and k_bias_exists and v_bias_exists + +- if all_bias_exists and self.attention_attrs["use_packed_matmul"]: ++ if all_bias_exists and self.attention_attrs["use_packed_matmul"] and qkv_dtype_equal: + # Combine 3 Adds into 1 packed Add + qkv_add_name = f"/model/layers.{layer_id}/attn/qkv_proj/Add" + self.make_packed_add(attention.q_proj.bias, attention.k_proj.bias, attention.v_proj.bias, qkv_add_name, root_input=self.attention_attrs["q_path"]) diff --git a/google-gemma-Gemma3-4B/qnn/oga_patch2.patch b/google-gemma-Gemma3-4B/qnn/oga_patch2.patch new file mode 100644 index 00000000..9edfa869 --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn/oga_patch2.patch @@ -0,0 +1,26 @@ +--- quantized_model_orig.py 2025-11-21 22:56:22.644060000 -0800 ++++ quantized_model.py 2025-11-21 22:59:41.067178000 -0800 +@@ -863,6 +863,23 @@ + self.pack_qzeros(temp_module) + module.qzeros = temp_module.qzeros + ++ def _load_quant_config(self, quant_attrs): ++ super()._load_quant_config(quant_attrs) ++ self.overrides = quant_attrs["config"].get("dynamic", {}) ++ ++ def get_overrides(self, layer_name): ++ for pattern, overrides in self.overrides.items(): ++ if re.match(pattern.removeprefix("+:"), layer_name): ++ return overrides ++ return {} ++ ++ def get_layer_bits(self, layer_name): ++ return self.get_overrides(layer_name).get("bits", self.global_bits) ++ ++ def get_layer_group_size(self, layer_name): ++ return self.get_overrides(layer_name).get("group_size", self.global_group_size) ++ ++ + class QuarkModel(QuantizedModel): + def __init__(self, quant_type, input_path, quant_attrs, q_size, kv_size, intermediate_size, num_layers): + super().__init__(quant_type, input_path, quant_attrs, q_size, kv_size, intermediate_size, num_layers) diff --git a/google-gemma-Gemma3-4B/qnn/qnn_req.txt b/google-gemma-Gemma3-4B/qnn/qnn_req.txt new file mode 100644 index 00000000..05c84579 --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn/qnn_req.txt @@ -0,0 +1,7 @@ +coloredlogs +flatbuffers +numpy >= 1.21.6 +packaging +protobuf +sympy +transformers==4.55.2 diff --git a/google-gemma-Gemma3-4B/qnn/requirements.txt b/google-gemma-Gemma3-4B/qnn/requirements.txt new file mode 100644 index 00000000..6f7f8caa --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn/requirements.txt @@ -0,0 +1,12 @@ +datasets +onnx==1.16.2 +onnx-ir==0.1.4 +onnxruntime-genai-cuda==0.9.0 +onnxruntime-gpu==1.22.0 +onnxscript +optimum +setuptools +tabulate +tokenizers +transformers==4.52.3 +tiktoken \ No newline at end of file
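
A note on the bundled patches: `gptqmodel_int8.patch`, `oga_patch1.patch`, and `oga_patch2.patch` modify the installed GPTQModel and onnxruntime-genai packages (per-layer bit overrides in the GPTQ v2-to-v1 qzeros conversion, a packed-QKV dtype guard in the model builder, and per-layer bits/group_size handling in the quantized-model loader). If `env_setup.sh` does not already apply them, the sketch below shows one way to do so manually. The in-package target paths are assumptions inferred from the functions each patch touches; verify them against your installed package versions before running.

```bash
# Hedged sketch: apply the bundled patches to the packages installed in the
# active environment. The target paths below are assumptions -- locate the real
# files by the functions each patch edits (convert_gptq_v2_to_v1_format_module,
# the attention builder, QuantizedModel) before running.
SITE_PACKAGES=$(python -c "import sysconfig; print(sysconfig.get_paths()['purelib'])")

# GPTQModel: use per-layer (dynamic) bit overrides in the GPTQ v2 -> v1 qzeros conversion
patch "$SITE_PACKAGES/gptqmodel/utils/model.py" gptqmodel_int8.patch

# onnxruntime-genai model builder: only pack QKV MatMul/Add when Q/K/V dtypes match
patch "$SITE_PACKAGES/onnxruntime_genai/models/builder.py" oga_patch1.patch

# onnxruntime-genai quantized-model loader: honor per-layer bits / group_size overrides
patch "$SITE_PACKAGES/onnxruntime_genai/models/quantized_model.py" oga_patch2.patch
```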