diff --git a/.aitk/configs/model_list.json b/.aitk/configs/model_list.json index ef6619ea..89a88148 100644 --- a/.aitk/configs/model_list.json +++ b/.aitk/configs/model_list.json @@ -12,7 +12,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -33,7 +34,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "CNN", "status": "Ready", @@ -55,7 +57,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -76,7 +79,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -97,7 +101,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -118,7 +123,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -139,7 +145,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -159,7 +166,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -180,7 +188,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -201,7 +210,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -223,7 +233,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", diff --git a/.aitk/docs/guide/ModelList.md b/.aitk/docs/guide/ModelList.md index 0ffc184b..d716a599 100644 --- a/.aitk/docs/guide/ModelList.md +++ b/.aitk/docs/guide/ModelList.md @@ -41,5 +41,5 @@ | [Clip Vit Base Patch16](https://huggingface.co/openai/clip-vit-base-patch16) | [Qualcomm NPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_qnn.json), [AMD NPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_qdq_amd.json), [AMD GPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_migraphx.json), [NVIDIA TensorRT for RTX](../../../openai-clip-vit-base-patch16/aitk/openai_clip_trtrtx.json), [Intel CPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_ov.json), [Intel GPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_ov.json), [Intel NPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_ov.json), [DirectML](../../../openai-clip-vit-base-patch16/aitk/openai_clip_dml.json) | | [Clip Vit Base Patch32](https://huggingface.co/openai/clip-vit-base-patch32) | [Qualcomm NPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_qnn.json), [AMD NPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_qdq_amd.json), [AMD GPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_migraphx.json), [NVIDIA TensorRT for RTX](../../../openai-clip-vit-base-patch32/aitk/openai_clip_trtrtx.json), [Intel CPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_ov.json), [Intel GPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_ov.json), [Intel NPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_ov.json), [DirectML](../../../openai-clip-vit-base-patch32/aitk/openai_clip_dml.json) | | [Clip Vit Large Patch14](https://huggingface.co/openai/clip-vit-large-patch14) | [Qualcomm NPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_qnn.json), [AMD 
NPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_qdq_amd.json), [AMD GPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_migraphx.json), [NVIDIA TensorRT for RTX](../../../openai-clip-vit-large-patch14/aitk/openai_clip_trtrtx.json), [Intel CPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_ov.json), [Intel GPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_ov.json), [Intel NPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_ov.json), [DirectML](../../../openai-clip-vit-large-patch14/aitk/openai_clip_dml.json) | -| [Resnet 50](https://huggingface.co/microsoft/resnet-50) | [Qualcomm NPU](../../../microsoft-resnet-50/aitk/resnet_qdq_qnn.json), [AMD NPU](../../../microsoft-resnet-50/aitk/resnet_qdq_amd.json), [AMD GPU](../../../microsoft-resnet-50/aitk/resnet_migraphx.json), [NVIDIA TensorRT for RTX](../../../microsoft-resnet-50/aitk/resnet_trtrtx.json), [Intel CPU](../../../microsoft-resnet-50/aitk/resnet_context_ov_static.json), [Intel GPU](../../../microsoft-resnet-50/aitk/resnet_context_ov_static.json), [Intel NPU](../../../microsoft-resnet-50/aitk/resnet_context_ov_static.json), [DirectML](../../../microsoft-resnet-50/aitk/resnet_dml.json) | -| [Vit Base Patch16 224](https://huggingface.co/google/vit-base-patch16-224) | [Qualcomm NPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_qdq_qnn.json), [AMD NPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_qdq_amd.json), [AMD GPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_migraphx.json), [NVIDIA TensorRT for RTX](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_trtrtx.json), [Intel CPU](../../../google-vit-base-patch16-224/aitk/vit_base_patch16_224_context_ov_static.json), [Intel GPU](../../../google-vit-base-patch16-224/aitk/vit_base_patch16_224_context_ov_static.json), [Intel NPU](../../../google-vit-base-patch16-224/aitk/vit_base_patch16_224_context_ov_static.json), [DirectML](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_dml.json) | +| [Resnet 50](https://huggingface.co/microsoft/resnet-50) | [Qualcomm NPU](../../../microsoft-resnet-50/aitk/resnet_qdq_qnn.json), [AMD NPU](../../../microsoft-resnet-50/aitk/resnet_qdq_amd.json), [AMD GPU](../../../microsoft-resnet-50/aitk/resnet_migraphx.json), [NVIDIA TensorRT for RTX](../../../microsoft-resnet-50/aitk/resnet_trtrtx.json), [Intel CPU](../../../microsoft-resnet-50/aitk/resnet_context_ov_static.json), [Intel GPU](../../../microsoft-resnet-50/aitk/resnet_context_ov_static.json), [Intel NPU](../../../microsoft-resnet-50/aitk/resnet_context_ov_static.json), [DirectML](../../../microsoft-resnet-50/aitk/resnet_dml.json), [WebGPU](../../../microsoft-resnet-50/aitk/resnet_webgpu.json) | +| [Vit Base Patch16 224](https://huggingface.co/google/vit-base-patch16-224) | [Qualcomm NPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_qdq_qnn.json), [AMD NPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_qdq_amd.json), [AMD GPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_migraphx.json), [NVIDIA TensorRT for RTX](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_trtrtx.json), [Intel CPU](../../../google-vit-base-patch16-224/aitk/vit_base_patch16_224_context_ov_static.json), [Intel GPU](../../../google-vit-base-patch16-224/aitk/vit_base_patch16_224_context_ov_static.json), [Intel NPU](../../../google-vit-base-patch16-224/aitk/vit_base_patch16_224_context_ov_static.json), 
[DirectML](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_dml.json), [WebGPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_webgpu.json) | diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/README.md b/Qwen-Qwen2.5-1.5B-Instruct/aitk/README.md index 13a61646..fec0500c 100644 --- a/Qwen-Qwen2.5-1.5B-Instruct/aitk/README.md +++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/README.md @@ -7,7 +7,7 @@ This repository demonstrates the optimization of the [Qwen2.5-1.5B-Instruct](htt + This process extends the QDQ flow and compiling specifically for **Qualcomm NPUs** - OpenVINO for Intel® CPU/GPU/NPU + This process uses OpenVINO specific passes like `OpenVINOOptimumConversion`, `OpenVINOIoUpdate` and `OpenVINOEncapsulation` -- Float downcasting for NVIDIA TRT for RTX GPU +- Float downcasting for NVIDIA TRT for RTX GPU / WebGPU for general GPU - DML for general GPU + This process uses AutoAWQ and ModelBuilder diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/_copy.json.config b/Qwen-Qwen2.5-1.5B-Instruct/aitk/_copy.json.config index 8e9510e6..b61489fe 100644 --- a/Qwen-Qwen2.5-1.5B-Instruct/aitk/_copy.json.config +++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/_copy.json.config @@ -61,6 +61,25 @@ "dst": "qwen2_5_dml_config.json.config", "replacements": [] }, + { + "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu_config.json", + "dst": "qwen2_5_webgpu_config.json", + "replacements": [ + { + "find": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "replace": "Qwen/Qwen2.5-1.5B-Instruct" + }, + { + "find": "model/deepseek", + "replace": "model/qwen2_5" + } + ] + }, + { + "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu_config.json.config", + "dst": "qwen2_5_webgpu_config.json.config", + "replacements": [] + }, { "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md", "dst": "README.md", diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml b/Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml index 3a7e04e6..244f7b57 100644 --- a/Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml +++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml @@ -34,6 +34,12 @@ recipes: - file: "qwen2_5_dml_config.json" device: gpu ep: DmlExecutionProvider + - file: "qwen2_5_webgpu_config.json" + device: gpu + ep: WebGpuExecutionProvider + - file: "qwen2_5_migraphx_config.json" + device: gpu + ep: MIGraphXExecutionProvider aitk: modelInfo: id: "huggingface/Qwen/Qwen2.5-1.5B-Instruct" diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/model_project.config b/Qwen-Qwen2.5-1.5B-Instruct/aitk/model_project.config index 10029125..ab06c67f 100644 --- a/Qwen-Qwen2.5-1.5B-Instruct/aitk/model_project.config +++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/model_project.config @@ -23,6 +23,14 @@ { "file": "qwen2_5_dml_config.json", "templateName": "qwen2_5_dml_config" + }, + { + "file": "qwen2_5_webgpu_config.json", + "templateName": "qwen2_5_webgpu_config" + }, + { + "file": "qwen2_5_migraphx_config.json", + "templateName": "qwen2_5_migraphx_config" } ], "modelInfo": { diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu_config.json b/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu_config.json new file mode 100644 index 00000000..d4ae439c --- /dev/null +++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu_config.json @@ -0,0 +1,38 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen2.5-1.5B-Instruct" + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ { "device": "gpu", "execution_providers": [ "WebGpuExecutionProvider" ] } ] + } + }, + "data_configs": [ 
+ { + "name": "wikitext2_train", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "wikitext", + "subset": "wikitext-2-raw-v1", + "split": "train" + }, + "pre_process_data_config": { + "strategy": "line-by-line", + "add_special_tokens": false, + "max_samples": 128, + "max_seq_len": 512 + } + } + ], + "passes": { + "builder": { "type": "ModelBuilder", "precision": "int4" } + }, + "target": "local_system", + "log_severity_level": 1, + "output_dir": "model/qwen2_5", + "cache_dir": "cache", + "no_artifacts": true, + "evaluate_input_model": false +} diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu_config.json.config b/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu_config.json.config new file mode 100644 index 00000000..b3e6c90c --- /dev/null +++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu_config.json.config @@ -0,0 +1,43 @@ +{ + "name": "Convert to WebGPU", + "oliveFile": "", + "isLLM": true, + "debugInfo": { + "autoGenerated": true, + "useModelBuilder": "builder" + }, + "needHFLogin": true, + "addCpu": false, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.builder", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md index 50d6122d..ca8e0c48 100644 --- a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md +++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md @@ -7,7 +7,7 @@ This repository demonstrates the optimization of the [DeepSeek-R1-Distill-Qwen-1 + This process extends the QDQ flow and compiling specifically for **Qualcomm NPUs** - OpenVINO for Intel® CPU/GPU/NPU + This process uses OpenVINO specific passes like `OpenVINOOptimumConversion`, `OpenVINOIoUpdate` and `OpenVINOEncapsulation` -- Float downcasting for NVIDIA TRT for RTX GPU +- Float downcasting for NVIDIA TRT for RTX GPU / WebGPU for general GPU - DML for general GPU + This process uses AutoAWQ and ModelBuilder diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu_config.json b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu_config.json new file mode 100644 index 00000000..fd193ded --- /dev/null +++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu_config.json @@ -0,0 +1,38 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ { "device": "gpu", "execution_providers": [ "WebGpuExecutionProvider" ] } ] + } + }, + "data_configs": [ + { + "name": "wikitext2_train", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "wikitext", + "subset": "wikitext-2-raw-v1", + "split": "train" + }, + "pre_process_data_config": { + "strategy": "line-by-line", + "add_special_tokens": false, + "max_samples": 128, + "max_seq_len": 512 + } + } + ], + "passes": { + "builder": { "type": "ModelBuilder", "precision": "int4" } + }, + "target": "local_system", + "log_severity_level": 1, + "output_dir": "model/deepseek", + 
"cache_dir": "cache", + "no_artifacts": true, + "evaluate_input_model": false +} diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu_config.json.config b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu_config.json.config new file mode 100644 index 00000000..b3e6c90c --- /dev/null +++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu_config.json.config @@ -0,0 +1,43 @@ +{ + "name": "Convert to WebGPU", + "oliveFile": "", + "isLLM": true, + "debugInfo": { + "autoGenerated": true, + "useModelBuilder": "builder" + }, + "needHFLogin": true, + "addCpu": false, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.builder", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml index 1c560030..eb66c59f 100644 --- a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml +++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml @@ -34,6 +34,12 @@ recipes: - file: "deepseek_dml_config.json" device: gpu ep: DmlExecutionProvider + - file: "deepseek_webgpu_config.json" + device: gpu + ep: WebGpuExecutionProvider + - file: "deepseek_migraphx_config.json" + device: gpu + ep: MIGraphXExecutionProvider aitk: modelInfo: id: "huggingface/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/model_project.config b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/model_project.config index 4c125a4b..6ec45979 100644 --- a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/model_project.config +++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/model_project.config @@ -23,6 +23,14 @@ { "file": "deepseek_dml_config.json", "templateName": "deepseek_dml_config" + }, + { + "file": "deepseek_webgpu_config.json", + "templateName": "deepseek_webgpu_config" + }, + { + "file": "deepseek_migraphx_config.json", + "templateName": "deepseek_migraphx_config" } ], "modelInfo": { diff --git a/google-bert-bert-base-multilingual-cased/aitk/README.md b/google-bert-bert-base-multilingual-cased/aitk/README.md index c6745c5c..867ec378 100644 --- a/google-bert-bert-base-multilingual-cased/aitk/README.md +++ b/google-bert-bert-base-multilingual-cased/aitk/README.md @@ -4,7 +4,7 @@ This folder contains examples of BERT optimization using different workflows. 
- QDQ for Qualcomm NPU / AMD NPU - OpenVINO for Intel® CPU/GPU/NPU -- Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU +- Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU / WebGPU for general GPU ## QDQ for Qualcomm NPU / AMD NPU diff --git a/google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_webgpu.json b/google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_webgpu.json new file mode 100644 index 00000000..962aba68 --- /dev/null +++ b/google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_webgpu.json @@ -0,0 +1,138 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "google-bert/bert-base-multilingual-cased", + "task": "feature-extraction" + }, + "systems": { + "host_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "cpu", + "execution_providers": [ + "CPUExecutionProvider" + ] + } + ] + }, + "target_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "evaluation_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "facebook/xnli", + "subset": "en", + "split": "validation" + }, + "pre_process_data_config": { + "input_cols": [ + "premise" + ], + "padding": "max_length", + "max_length": 128, + "max_samples": 10 + }, + "dataloader_config": { + "batch_size": 1 + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg", + "priority": 1, + "goal": { + "type": "percent-min-improvement", + "value": 0.1 + } + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + }, + { + "name": "throughput", + "type": "throughput", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg" + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 20, + "save_as_external_data": true + }, + "transformer_optimizer": { + "type": "OrtTransformersOptimization", + "model_type": "bert", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": false, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": false, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": false, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "enable_rotary_embeddings": true + }, + "save_as_external_data": true + } + }, + "host": "host_system", + "target": "target_system", + "evaluator": "common_evaluator", + "cache_dir": "cache", + "output_dir": "model/google_bert", + "evaluate_input_model": false +} diff --git a/google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_webgpu.json.config b/google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_webgpu.json.config new file mode 100644 index 00000000..16d4d9bd --- /dev/null +++ b/google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_webgpu.json.config @@ -0,0 +1,123 @@ +{ + 
"name": "Convert to WebGPU", + "addCpu": false, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.target_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [ + { + "name": "Evaluation Dataset", + "tags": [ + "EvaluationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": { + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": "EvaluationDataset" + } + }, + { + "name": "Evaluation Dataset Subset", + "tags": [ + "EvaluationDatasetSubset", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": { + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": "EvaluationDatasetSubset" + } + }, + { + "name": "Evaluation Dataset Split", + "tags": [ + "EvaluationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[0].load_dataset_config.split", + "template": "EvaluationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/google-bert-bert-base-multilingual-cased/aitk/info.yml b/google-bert-bert-base-multilingual-cased/aitk/info.yml index 42234846..f37be1bc 100644 --- a/google-bert-bert-base-multilingual-cased/aitk/info.yml +++ b/google-bert-bert-base-multilingual-cased/aitk/info.yml @@ -20,6 +20,9 @@ recipes: - file: "bert-base-multilingual-cased_dml.json" device: gpu ep: DmlExecutionProvider + - file: "bert-base-multilingual-cased_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider - file: "bert-base-multilingual-cased_migraphx.json" device: gpu ep: MIGraphXExecutionProvider diff --git a/google-bert-bert-base-multilingual-cased/aitk/model_project.config b/google-bert-bert-base-multilingual-cased/aitk/model_project.config index b817eb1f..92911e8a 100644 --- a/google-bert-bert-base-multilingual-cased/aitk/model_project.config +++ b/google-bert-bert-base-multilingual-cased/aitk/model_project.config @@ -20,6 +20,10 @@ "file": "bert-base-multilingual-cased_dml.json", "templateName": "bert-base-multilingual-cased_dml" }, + { + "file": "bert-base-multilingual-cased_webgpu.json", + "templateName": "bert-base-multilingual-cased_webgpu" + }, { "file": "bert-base-multilingual-cased_migraphx.json", "templateName": "bert-base-multilingual-cased_migraphx" diff --git 
a/google-vit-base-patch16-224/aitk/README.md b/google-vit-base-patch16-224/aitk/README.md index 7bb4b160..f2a46bb2 100644 --- a/google-vit-base-patch16-224/aitk/README.md +++ b/google-vit-base-patch16-224/aitk/README.md @@ -4,7 +4,7 @@ This folder contains examples of VIT optimization using different workflows. - QDQ for Qualcomm NPU / AMD NPU - OpenVINO for Intel® CPU/GPU/NPU -- Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU +- Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU / WebGPU for general GPU ## Optimization Workflows diff --git a/google-vit-base-patch16-224/aitk/inference_sample.ipynb b/google-vit-base-patch16-224/aitk/inference_sample.ipynb index 62936bbe..ece8e99e 100644 --- a/google-vit-base-patch16-224/aitk/inference_sample.ipynb +++ b/google-vit-base-patch16-224/aitk/inference_sample.ipynb @@ -42,6 +42,29 @@ "register_execution_providers()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/google-vit-base-patch16-224/aitk/info.yml b/google-vit-base-patch16-224/aitk/info.yml index 9e7ceb4f..50ea5f2c 100644 --- a/google-vit-base-patch16-224/aitk/info.yml +++ b/google-vit-base-patch16-224/aitk/info.yml @@ -20,6 +20,9 @@ recipes: - file: "vit-base-patch16-224_dml.json" device: gpu ep: DmlExecutionProvider + - file: "vit-base-patch16-224_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider - file: "vit-base-patch16-224_migraphx.json" device: gpu ep: MIGraphXExecutionProvider diff --git a/google-vit-base-patch16-224/aitk/model_project.config b/google-vit-base-patch16-224/aitk/model_project.config index 602d90f7..aeb99a06 100644 --- a/google-vit-base-patch16-224/aitk/model_project.config +++ b/google-vit-base-patch16-224/aitk/model_project.config @@ -20,6 +20,10 @@ "file": "vit-base-patch16-224_dml.json", "templateName": "vit-base-patch16-224_dml" }, + { + "file": "vit-base-patch16-224_webgpu.json", + "templateName": "vit-base-patch16-224_webgpu" + }, { "file": "vit-base-patch16-224_migraphx.json", "templateName": "vit-base-patch16-224_migraphx" diff --git a/google-vit-base-patch16-224/aitk/vit-base-patch16-224_dml_inference_sample.ipynb b/google-vit-base-patch16-224/aitk/vit-base-patch16-224_dml_inference_sample.ipynb new file mode 100644 index 00000000..19adf2a8 --- /dev/null +++ b/google-vit-base-patch16-224/aitk/vit-base-patch16-224_dml_inference_sample.ipynb @@ -0,0 +1,215 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "\n", + "ExecutionProvider=\"DmlExecutionProvider\"\n", + "if ExecutionProvider == \"OpenVINOExecutionProvider\":\n", + " onnx_model_path = \"./model/ov_model_st_quant.onnx\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + 
"outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import onnxruntime as ort\n", + "import time\n", + "import torch\n", + "import torchvision.transforms as transforms\n", + "from datasets import load_dataset\n", + "from transformers import ViTFeatureExtractor, ViTForImageClassification" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_samples = 256" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load datasets\n", + "\n", + "feature_extractor = ViTFeatureExtractor.from_pretrained(\"google/vit-base-patch16-224\")\n", + "preprocess = transforms.Compose([\n", + " transforms.Lambda(lambda img: img.convert(\"RGB\")),\n", + " transforms.Resize((224, 224)),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),\n", + "])\n", + "\n", + "def imageTransform(example):\n", + " example[\"image\"] = preprocess(example[\"image\"])\n", + " return example\n", + "datasetStream = load_dataset(\"timm/mini-imagenet\", split=\"validation\", streaming=True, trust_remote_code=True)\n", + "iterable_dataset = iter(datasetStream)\n", + "selected_samples = [next(iterable_dataset) for _ in range(num_samples)]\n", + "selected_samples = list(map(imageTransform, selected_samples))\n", + "\n", + "def get_imagenet_label_map():\n", + " import json\n", + " from pathlib import Path\n", + " cache_file = Path(f\"../../cache/data/imagenet_class_index.json\")\n", + " if not cache_file.exists():\n", + " import requests \n", + " imagenet_class_index_url = (\n", + " \"https://raw.githubusercontent.com/pytorch/vision/main/gallery/assets/imagenet_class_index.json\"\n", + " )\n", + " response = requests.get(imagenet_class_index_url)\n", + " response.raise_for_status() # Ensure the request was successful\n", + " content = response.json()\n", + " cache_file.parent.resolve().mkdir(parents=True, exist_ok=True)\n", + " with open(cache_file, \"w\") as f:\n", + " json.dump(content, f)\n", + " else:\n", + " with open(cache_file) as f:\n", + " content = json.loads(f.read())\n", + "\n", + " return {v[0]: int(k) for k, v in content.items()}\n", + "\n", + "label_map = get_imagenet_label_map()\n", + "label_names = datasetStream.features[\"label\"].names\n", + "\n", + "def mini_to_imagenet_label(mini_label):\n", + " class_name = label_names[mini_label]\n", + " return label_map[class_name]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Original model metrics\n", + "\n", + "def evaluate_torch(model, selected_samples, device):\n", + " model.eval()\n", + " correct, total = 0, 0\n", + " latencies = []\n", + " with torch.no_grad():\n", + " for example in 
selected_samples:\n", + " image = example[\"image\"].unsqueeze(0).to(device)\n", + " label = torch.tensor(example[\"label\"]).to(device)\n", + " label = mini_to_imagenet_label(label.item())\n", + " \n", + " start_time = time.time()\n", + " output = model(image)\n", + " end_time = time.time()\n", + " \n", + " latencies.append((end_time - start_time))\n", + " pred = torch.argmax(output.logits, dim=1)\n", + " correct += (pred == label).sum().item()\n", + " total += 1\n", + " \n", + " accuracy = correct / total\n", + " avg_latency = np.mean(latencies)\n", + " return accuracy, avg_latency\n", + "\n", + "device = torch.device(\"cpu\")\n", + "model = ViTForImageClassification.from_pretrained(\"google/vit-base-patch16-224\").to(device)\n", + "accuracy, avg_latency = evaluate_torch(model, selected_samples, device)\n", + "\n", + "print(f\"Original Model Accuracy: {accuracy * 100:.2f}%\")\n", + "print(f\"Original Model Average Latency Per Image: {avg_latency * 1000:.2f} ms\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Quantized model metrics\n", + "\n", + "def evaluate_onnx(session, selected_samples):\n", + " correct, total = 0, 0\n", + " latencies = []\n", + " input_name = session.get_inputs()[0].name\n", + " output_name = session.get_outputs()[0].name\n", + "\n", + " for example in selected_samples:\n", + " image = np.expand_dims(example[\"image\"], axis=0)\n", + " label = example[\"label\"]\n", + " label = mini_to_imagenet_label(label)\n", + " \n", + " start_time = time.time()\n", + " output = session.run([output_name], {input_name: image.astype(np.float16)})[0]\n", + " end_time = time.time()\n", + " \n", + " latencies.append((end_time - start_time))\n", + " pred = np.argmax(output, axis=1)[0]\n", + " correct += (pred == label)\n", + " total += 1\n", + " \n", + " accuracy = correct / total\n", + " avg_latency = np.mean(latencies)\n", + " return accuracy, avg_latency\n", + "\n", + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + "\n", + "\n", + "session_options = ort.SessionOptions()\n", + "\n", + "add_ep_for_device(session_options, ExecutionProvider, ort.OrtHardwareDeviceType.NPU)\n", + "\n", + "session = ort.InferenceSession(\n", + " onnx_model_path, # a model with QNN EPContext nodes\n", + " sess_options=session_options,\n", + ")\n", + "\n", + "accuracy, avg_latency = evaluate_onnx(session, selected_samples)\n", + "\n", + "print(f\"Quantized Model Accuracy: {accuracy * 100:.2f}%\")\n", + "print(f\"Quantized Model Average Latency Per Image: {avg_latency * 1000:.2f} ms\")" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/google-vit-base-patch16-224/aitk/vit-base-patch16-224_trtrtx_inference_sample.ipynb b/google-vit-base-patch16-224/aitk/vit-base-patch16-224_trtrtx_inference_sample.ipynb new file mode 100644 index 00000000..7b3c0075 --- /dev/null +++ b/google-vit-base-patch16-224/aitk/vit-base-patch16-224_trtrtx_inference_sample.ipynb @@ -0,0 +1,215 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "\n", 
+ "ExecutionProvider=\"NvTensorRTRTXExecutionProvider\"\n", + "if ExecutionProvider == \"OpenVINOExecutionProvider\":\n", + " onnx_model_path = \"./model/ov_model_st_quant.onnx\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import onnxruntime as ort\n", + "import time\n", + "import torch\n", + "import torchvision.transforms as transforms\n", + "from datasets import load_dataset\n", + "from transformers import ViTFeatureExtractor, ViTForImageClassification" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_samples = 256" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load datasets\n", + "\n", + "feature_extractor = ViTFeatureExtractor.from_pretrained(\"google/vit-base-patch16-224\")\n", + "preprocess = transforms.Compose([\n", + " transforms.Lambda(lambda img: img.convert(\"RGB\")),\n", + " transforms.Resize((224, 224)),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),\n", + "])\n", + "\n", + "def imageTransform(example):\n", + " example[\"image\"] = preprocess(example[\"image\"])\n", + " return example\n", + "datasetStream = load_dataset(\"timm/mini-imagenet\", split=\"validation\", streaming=True, trust_remote_code=True)\n", + "iterable_dataset = iter(datasetStream)\n", + "selected_samples = [next(iterable_dataset) for _ in range(num_samples)]\n", + "selected_samples = list(map(imageTransform, selected_samples))\n", + "\n", + "def get_imagenet_label_map():\n", + " import json\n", + " from pathlib import Path\n", + " cache_file = Path(f\"../../cache/data/imagenet_class_index.json\")\n", + " if not cache_file.exists():\n", + " import requests \n", + " imagenet_class_index_url = (\n", + " \"https://raw.githubusercontent.com/pytorch/vision/main/gallery/assets/imagenet_class_index.json\"\n", + " )\n", + " response = requests.get(imagenet_class_index_url)\n", + " response.raise_for_status() # Ensure the request was successful\n", + " content = response.json()\n", + " cache_file.parent.resolve().mkdir(parents=True, exist_ok=True)\n", + " with open(cache_file, \"w\") as f:\n", + " json.dump(content, f)\n", + " else:\n", + " with open(cache_file) as f:\n", + " content = json.loads(f.read())\n", + "\n", + " return {v[0]: int(k) for k, v in content.items()}\n", + "\n", + "label_map = get_imagenet_label_map()\n", + "label_names = datasetStream.features[\"label\"].names\n", + "\n", + "def mini_to_imagenet_label(mini_label):\n", + " class_name = label_names[mini_label]\n", + " return label_map[class_name]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "# Original model metrics\n", + "\n", + "def evaluate_torch(model, selected_samples, device):\n", + " model.eval()\n", + " correct, total = 0, 0\n", + " latencies = []\n", + " with torch.no_grad():\n", + " for example in selected_samples:\n", + " image = example[\"image\"].unsqueeze(0).to(device)\n", + " label = torch.tensor(example[\"label\"]).to(device)\n", + " label = mini_to_imagenet_label(label.item())\n", + " \n", + " start_time = time.time()\n", + " output = model(image)\n", + " end_time = time.time()\n", + " \n", + " latencies.append((end_time - start_time))\n", + " pred = torch.argmax(output.logits, dim=1)\n", + " correct += (pred == label).sum().item()\n", + " total += 1\n", + " \n", + " accuracy = correct / total\n", + " avg_latency = np.mean(latencies)\n", + " return accuracy, avg_latency\n", + "\n", + "device = torch.device(\"cpu\")\n", + "model = ViTForImageClassification.from_pretrained(\"google/vit-base-patch16-224\").to(device)\n", + "accuracy, avg_latency = evaluate_torch(model, selected_samples, device)\n", + "\n", + "print(f\"Original Model Accuracy: {accuracy * 100:.2f}%\")\n", + "print(f\"Original Model Average Latency Per Image: {avg_latency * 1000:.2f} ms\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Quantized model metrics\n", + "\n", + "def evaluate_onnx(session, selected_samples):\n", + " correct, total = 0, 0\n", + " latencies = []\n", + " input_name = session.get_inputs()[0].name\n", + " output_name = session.get_outputs()[0].name\n", + "\n", + " for example in selected_samples:\n", + " image = np.expand_dims(example[\"image\"], axis=0)\n", + " label = example[\"label\"]\n", + " label = mini_to_imagenet_label(label)\n", + " \n", + " start_time = time.time()\n", + " output = session.run([output_name], {input_name: image.astype(np.float16)})[0]\n", + " end_time = time.time()\n", + " \n", + " latencies.append((end_time - start_time))\n", + " pred = np.argmax(output, axis=1)[0]\n", + " correct += (pred == label)\n", + " total += 1\n", + " \n", + " accuracy = correct / total\n", + " avg_latency = np.mean(latencies)\n", + " return accuracy, avg_latency\n", + "\n", + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + "\n", + "\n", + "session_options = ort.SessionOptions()\n", + "\n", + "add_ep_for_device(session_options, ExecutionProvider, ort.OrtHardwareDeviceType.NPU)\n", + "\n", + "session = ort.InferenceSession(\n", + " onnx_model_path, # a model with QNN EPContext nodes\n", + " sess_options=session_options,\n", + ")\n", + "\n", + "accuracy, avg_latency = evaluate_onnx(session, selected_samples)\n", + "\n", + "print(f\"Quantized Model Accuracy: {accuracy * 100:.2f}%\")\n", + "print(f\"Quantized Model Average Latency Per Image: {avg_latency * 1000:.2f} ms\")" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/google-vit-base-patch16-224/aitk/vit-base-patch16-224_webgpu.json b/google-vit-base-patch16-224/aitk/vit-base-patch16-224_webgpu.json new file mode 100644 index 00000000..c1b6866e --- /dev/null +++ 
b/google-vit-base-patch16-224/aitk/vit-base-patch16-224_webgpu.json @@ -0,0 +1,142 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "google/vit-base-patch16-224", + "task": "image-classification", + "io_config": { + "input_names": [ + "pixel_values" + ], + "input_shapes": [ + [ + 1, + 3, + 224, + 224 + ] + ], + "output_names": [ + "output" + ] + } + }, + "systems": { + "host_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "cpu", + "execution_providers": [ + "CPUExecutionProvider" + ] + } + ] + }, + "target_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "evaluation_data_config", + "type": "HuggingfaceContainer", + "user_script": "vit-base-patch16-224.py", + "load_dataset_config": { + "data_name": "timm/mini-imagenet", + "split": "validation", + "streaming": true, + "trust_remote_code": true + }, + "pre_process_data_config": { + "type": "dataset_pre_process", + "size": 1000, + "cache_key": "imagedata_evaluation" + }, + "post_process_data_config": { + "type": "dataset_post_process" + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "accuracy", + "type": "accuracy", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "accuracy_score", + "priority": 1, + "metric_config": { + "task": "multiclass", + "num_classes": 1000 + } + } + ] + }, + { + "name": "latency", + "type": "latency", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg", + "priority": 2 + } + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 20, + "save_as_external_data": true + }, + "transformer_optimizer": { + "type": "OrtTransformersOptimization", + "model_type": "vit", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": false, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": false, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": false, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "enable_rotary_embeddings": true + }, + "save_as_external_data": true + } + }, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "host_system", + "target": "target_system", + "cache_dir": "cache", + "output_dir": "model/vit" +} diff --git a/google-vit-base-patch16-224/aitk/vit-base-patch16-224_webgpu.json.config b/google-vit-base-patch16-224/aitk/vit-base-patch16-224_webgpu.json.config new file mode 100644 index 00000000..23c3f074 --- /dev/null +++ b/google-vit-base-patch16-224/aitk/vit-base-patch16-224_webgpu.json.config @@ -0,0 +1,104 @@ +{ + "name": "Convert to WebGPU", + "addCpu": false, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.target_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, 
+ "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [ + { + "name": "Evaluation Dataset", + "tags": [ + "EvaluationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "timm/mini-imagenet", + "imagenet-1k" + ], + "template": { + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "timm/mini-imagenet", + "imagenet-1k" + ], + "template": "EvaluationDataset" + } + }, + { + "name": "Evaluation Dataset Split", + "tags": [ + "EvaluationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[0].load_dataset_config.split", + "template": "EvaluationDatasetSplit" + } + }, + { + "name": "Evaluation Dataset Size", + "type": "int", + "path": "data_configs[0].pre_process_data_config.size", + "template": { + "path": "data_configs[0].pre_process_data_config.size", + "template": "EvaluationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/google-vit-base-patch16-224/aitk/vit-base-patch16-224_webgpu_inference_sample.ipynb b/google-vit-base-patch16-224/aitk/vit-base-patch16-224_webgpu_inference_sample.ipynb new file mode 100644 index 00000000..4f3ab189 --- /dev/null +++ b/google-vit-base-patch16-224/aitk/vit-base-patch16-224_webgpu_inference_sample.ipynb @@ -0,0 +1,215 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "\n", + "ExecutionProvider=\"WebGpuExecutionProvider\"\n", + "if ExecutionProvider == \"OpenVINOExecutionProvider\":\n", + " onnx_model_path = \"./model/ov_model_st_quant.onnx\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import onnxruntime as ort\n", + "import time\n", + "import torch\n", + "import torchvision.transforms as transforms\n", + "from datasets import load_dataset\n", + "from transformers import ViTFeatureExtractor, ViTForImageClassification" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_samples = 256" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load datasets\n", + "\n", + "feature_extractor = 
ViTFeatureExtractor.from_pretrained(\"google/vit-base-patch16-224\")\n", + "preprocess = transforms.Compose([\n", + " transforms.Lambda(lambda img: img.convert(\"RGB\")),\n", + " transforms.Resize((224, 224)),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),\n", + "])\n", + "\n", + "def imageTransform(example):\n", + " example[\"image\"] = preprocess(example[\"image\"])\n", + " return example\n", + "datasetStream = load_dataset(\"timm/mini-imagenet\", split=\"validation\", streaming=True, trust_remote_code=True)\n", + "iterable_dataset = iter(datasetStream)\n", + "selected_samples = [next(iterable_dataset) for _ in range(num_samples)]\n", + "selected_samples = list(map(imageTransform, selected_samples))\n", + "\n", + "def get_imagenet_label_map():\n", + " import json\n", + " from pathlib import Path\n", + " cache_file = Path(f\"../../cache/data/imagenet_class_index.json\")\n", + " if not cache_file.exists():\n", + " import requests \n", + " imagenet_class_index_url = (\n", + " \"https://raw.githubusercontent.com/pytorch/vision/main/gallery/assets/imagenet_class_index.json\"\n", + " )\n", + " response = requests.get(imagenet_class_index_url)\n", + " response.raise_for_status() # Ensure the request was successful\n", + " content = response.json()\n", + " cache_file.parent.resolve().mkdir(parents=True, exist_ok=True)\n", + " with open(cache_file, \"w\") as f:\n", + " json.dump(content, f)\n", + " else:\n", + " with open(cache_file) as f:\n", + " content = json.loads(f.read())\n", + "\n", + " return {v[0]: int(k) for k, v in content.items()}\n", + "\n", + "label_map = get_imagenet_label_map()\n", + "label_names = datasetStream.features[\"label\"].names\n", + "\n", + "def mini_to_imagenet_label(mini_label):\n", + " class_name = label_names[mini_label]\n", + " return label_map[class_name]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Original model metrics\n", + "\n", + "def evaluate_torch(model, selected_samples, device):\n", + " model.eval()\n", + " correct, total = 0, 0\n", + " latencies = []\n", + " with torch.no_grad():\n", + " for example in selected_samples:\n", + " image = example[\"image\"].unsqueeze(0).to(device)\n", + " label = torch.tensor(example[\"label\"]).to(device)\n", + " label = mini_to_imagenet_label(label.item())\n", + " \n", + " start_time = time.time()\n", + " output = model(image)\n", + " end_time = time.time()\n", + " \n", + " latencies.append((end_time - start_time))\n", + " pred = torch.argmax(output.logits, dim=1)\n", + " correct += (pred == label).sum().item()\n", + " total += 1\n", + " \n", + " accuracy = correct / total\n", + " avg_latency = np.mean(latencies)\n", + " return accuracy, avg_latency\n", + "\n", + "device = torch.device(\"cpu\")\n", + "model = ViTForImageClassification.from_pretrained(\"google/vit-base-patch16-224\").to(device)\n", + "accuracy, avg_latency = evaluate_torch(model, selected_samples, device)\n", + "\n", + "print(f\"Original Model Accuracy: {accuracy * 100:.2f}%\")\n", + "print(f\"Original Model Average Latency Per Image: {avg_latency * 1000:.2f} ms\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Quantized model metrics\n", + "\n", + "def evaluate_onnx(session, selected_samples):\n", + " correct, total = 0, 0\n", + " latencies = []\n", + " input_name = session.get_inputs()[0].name\n", + " output_name = 
session.get_outputs()[0].name\n", + "\n", + " for example in selected_samples:\n", + " image = np.expand_dims(example[\"image\"], axis=0)\n", + " label = example[\"label\"]\n", + " label = mini_to_imagenet_label(label)\n", + " \n", + " start_time = time.time()\n", + " output = session.run([output_name], {input_name: image.astype(np.float16)})[0]\n", + " end_time = time.time()\n", + " \n", + " latencies.append((end_time - start_time))\n", + " pred = np.argmax(output, axis=1)[0]\n", + " correct += (pred == label)\n", + " total += 1\n", + " \n", + " accuracy = correct / total\n", + " avg_latency = np.mean(latencies)\n", + " return accuracy, avg_latency\n", + "\n", + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + "\n", + "\n", + "session_options = ort.SessionOptions()\n", + "\n", + "add_ep_for_device(session_options, ExecutionProvider, ort.OrtHardwareDeviceType.NPU)\n", + "\n", + "session = ort.InferenceSession(\n", + " onnx_model_path, # a model with QNN EPContext nodes\n", + " sess_options=session_options,\n", + ")\n", + "\n", + "accuracy, avg_latency = evaluate_onnx(session, selected_samples)\n", + "\n", + "print(f\"Quantized Model Accuracy: {accuracy * 100:.2f}%\")\n", + "print(f\"Quantized Model Average Latency Per Image: {avg_latency * 1000:.2f} ms\")" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/intel-bert-base-uncased-mrpc/aitk/README.md b/intel-bert-base-uncased-mrpc/aitk/README.md index 66283471..0d59c1da 100644 --- a/intel-bert-base-uncased-mrpc/aitk/README.md +++ b/intel-bert-base-uncased-mrpc/aitk/README.md @@ -4,7 +4,7 @@ This folder contains examples of BERT optimization using different workflows. 
- QDQ for Qualcomm NPU / AMD NPU - OpenVINO for Intel® CPU/GPU/NPU -- Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU +- Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU / WebGPU for general GPU ## QDQ for Qualcomm NPU / AMD NPU diff --git a/intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json b/intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json new file mode 100644 index 00000000..26a24bb8 --- /dev/null +++ b/intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json @@ -0,0 +1,130 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "Intel/bert-base-uncased-mrpc", + "task": "text-classification", + "load_kwargs": { + "attn_implementation": "eager" + } + }, + "systems": { + "host_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "cpu", + "execution_providers": [ + "CPUExecutionProvider" + ] + } + ] + }, + "target_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "glue_mrpc_eval", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "glue", + "subset": "mrpc", + "split": "validation" + }, + "pre_process_data_config": { + "max_length": 128, + "padding": "max_length", + "input_cols": [ + "sentence1", + "sentence2" + ], + "max_samples": 100 + }, + "dataloader_config": { + "batch_size": 1 + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "accuracy", + "type": "accuracy", + "data_config": "glue_mrpc_eval", + "sub_types": [ + { + "name": "accuracy_score", + "priority": 1 + }, + { + "name": "f1_score" + } + ] + }, + { + "name": "latency", + "type": "latency", + "data_config": "glue_mrpc_eval", + "sub_types": [ + { + "name": "avg", + "priority": 2 + } + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 20, + "save_as_external_data": true + }, + "transformer_optimizer": { + "type": "OrtTransformersOptimization", + "model_type": "bert", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": false, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": false, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": false, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "enable_rotary_embeddings": true + }, + "save_as_external_data": true + } + }, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "host_system", + "target": "target_system", + "cache_dir": "cache", + "output_dir": "model/bert_webgpu" +} diff --git a/intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json.config b/intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json.config new file mode 100644 index 00000000..b0f0ed25 --- /dev/null +++ b/intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json.config @@ -0,0 +1,102 @@ +{ + "name": "Convert to WebGPU", + "addCpu": false, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.target_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": 
false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [ + { + "name": "Evaluation Dataset", + "tags": [ + "EvaluationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "glue" + ], + "template": { + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "glue" + ], + "template": "EvaluationDataset" + } + }, + { + "name": "Evaluation Dataset Split", + "tags": [ + "EvaluationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[0].load_dataset_config.split", + "template": "EvaluationDatasetSplit" + } + }, + { + "name": "Evaluation Dataset Size", + "type": "int", + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": "EvaluationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/intel-bert-base-uncased-mrpc/aitk/info.yml b/intel-bert-base-uncased-mrpc/aitk/info.yml index d9bb76a7..af232f23 100644 --- a/intel-bert-base-uncased-mrpc/aitk/info.yml +++ b/intel-bert-base-uncased-mrpc/aitk/info.yml @@ -23,6 +23,9 @@ recipes: - file: "bert_dml.json" device: gpu ep: DmlExecutionProvider + - file: "bert_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider - file: "bert_migraphx.json" device: gpu ep: MIGraphXExecutionProvider diff --git a/intel-bert-base-uncased-mrpc/aitk/model_project.config b/intel-bert-base-uncased-mrpc/aitk/model_project.config index 95de4455..fb63a594 100644 --- a/intel-bert-base-uncased-mrpc/aitk/model_project.config +++ b/intel-bert-base-uncased-mrpc/aitk/model_project.config @@ -20,6 +20,10 @@ "file": "bert_dml.json", "templateName": "bert_dml" }, + { + "file": "bert_webgpu.json", + "templateName": "bert_webgpu" + }, { "file": "bert_migraphx.json", "templateName": "bert_migraphx" diff --git a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/README.md b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/README.md index af5d33de..38216fb9 100644 --- a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/README.md +++ b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/README.md @@ -4,7 +4,7 @@ This folder contains examples of Laion Clip optimization using different workflo - QDQ for Qualcomm NPU / AMD NPU - OpenVINO for Intel® CPU/GPU/NPU -- Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU +- Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU / WebGPU for general GPU ## Laion Clip optimization with QDQ for Qualcomm NPU / AMD NPU @@ -18,7 +18,7 @@ This workflow performs quantization with OpenVINO NNCF. 
It performs the optimiza - *HuggingFace Model -> OpenVINO Model -> Quantized OpenVINO model -> Quantized encapsulated ONNX OpenVINO IR model* -## Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU +## Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU / WebGPU for general GPU It performs the optimization pipeline: diff --git a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/inference_sample.ipynb b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/inference_sample.ipynb index 8a9b4f5a..670b0cd2 100644 --- a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/inference_sample.ipynb +++ b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/inference_sample.ipynb @@ -53,6 +53,34 @@ "register_execution_providers()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf6bb9b8", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/info.yml b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/info.yml index c717da13..2286f064 100644 --- a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/info.yml +++ b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/info.yml @@ -20,6 +20,9 @@ recipes: - file: "laion_clip_dml.json" device: gpu ep: DmlExecutionProvider + - file: "laion_clip_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider - file: "laion_clip_migraphx.json" device: gpu ep: MIGraphXExecutionProvider diff --git a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_dml_inference_sample.ipynb b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_dml_inference_sample.ipynb new file mode 100644 index 00000000..e174c596 --- /dev/null +++ b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_dml_inference_sample.ipynb @@ -0,0 +1,115 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "aeb33f1a", + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "ExecutionProvider=\"DmlExecutionProvider\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22477669", + "metadata": {}, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "307fcca8", + "metadata": {}, + "outputs": [], + "source": [ + 
"from PIL import Image\n", + "import requests\n", + " \n", + "from transformers import CLIPProcessor\n", + "import onnxruntime as ort\n", + "import numpy as np\n", + "import torch\n", + " \n", + "processor = CLIPProcessor.from_pretrained(\"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\", use_fast=False)\n", + " \n", + "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", + "image = Image.open(requests.get(url, stream=True).raw)\n", + " \n", + "inputs = processor(text=[\"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\"],\n", + " images=image, return_tensors=\"np\", padding=\"max_length\",\n", + " max_length= 77, truncation=True)\n", + " \n", + "\n", + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + " \n", + "opts = ort.SessionOptions()\n", + " \n", + "add_ep_for_device(opts, ExecutionProvider, ort.OrtHardwareDeviceType.GPU)\n", + "assert opts.has_providers()\n", + "\n", + "# options = ort.SessionOptions()\n", + "session = ort.InferenceSession(onnx_model_path,\n", + " sess_options=opts,\n", + " # providers=[ExecutionProvider],\n", + " # provider_options=[provider_options]\n", + ")\n", + "logits_per_image = session.run([\"logits_per_image\"],\n", + " {\n", + " \"input_ids\": inputs['input_ids'].astype(np.int64),\n", + " \"attention_mask\": inputs['attention_mask'].astype(np.int64),\n", + " \"pixel_values\": inputs['pixel_values'].astype(np.float16)\n", + " })\n", + " \n", + "probs = torch.tensor(logits_per_image[0]).softmax(dim=1)\n", + "print(\"Label probs:\", probs)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "winml", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_ov_inference_sample.ipynb b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_ov_inference_sample.ipynb new file mode 100644 index 00000000..e09f24a8 --- /dev/null +++ b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_ov_inference_sample.ipynb @@ -0,0 +1,113 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "aeb33f1a", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/openvino_model_quant_st.onnx\"\n", + "ExecutionProvider=\"OpenVINOExecutionProvider\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf6bb9b8", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + 
"def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "307fcca8", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "from PIL import Image\n", + "import requests\n", + " \n", + "from transformers import CLIPProcessor\n", + "import onnxruntime as ort\n", + "import numpy as np\n", + "import torch\n", + " \n", + "processor = CLIPProcessor.from_pretrained(\"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\", use_fast=False)\n", + " \n", + "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", + "image = Image.open(requests.get(url, stream=True).raw)\n", + " \n", + "inputs = processor(text=[\"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\"],\n", + " images=image, return_tensors=\"np\", padding=\"max_length\",\n", + " max_length= 77, truncation=True)\n", + " \n", + "\n", + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + " \n", + "opts = ort.SessionOptions()\n", + " \n", + "add_ep_for_device(opts, ExecutionProvider, ort.OrtHardwareDeviceType.NPU)\n", + "assert opts.has_providers()\n", + "\n", + "# options = ort.SessionOptions()\n", + "session = ort.InferenceSession(onnx_model_path,\n", + " sess_options=opts,\n", + " # providers=[ExecutionProvider],\n", + " # provider_options=[provider_options]\n", + ")\n", + "logits_per_image = session.run([\"logits_per_image\"],\n", + " {\n", + " \"input_ids\": inputs['input_ids'].astype(np.int64),\n", + " \"attention_mask\": inputs['attention_mask'].astype(np.int64),\n", + " \"pixel_values\": inputs['pixel_values']\n", + " })\n", + " \n", + "probs = torch.tensor(logits_per_image[0]).softmax(dim=1)\n", + "print(\"Label probs:\", probs)" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_text_qnn_inference_sample.ipynb b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_text_qnn_inference_sample.ipynb new file mode 100644 index 00000000..0884b6ac --- /dev/null +++ b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_text_qnn_inference_sample.ipynb @@ -0,0 +1,166 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "43751a72", + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "ExecutionProvider=\"QNNExecutionProvider\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0ea54b2", + "metadata": {}, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import 
subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "markdown", + "id": "897ffb42-3569-4d78-b99d-355a38fdce35", + "metadata": {}, + "source": [ + "### Data Processor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa8d84cd-4853-4746-bce3-b281bfc23d8b", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import CLIPProcessor\n", + "\n", + "processor = CLIPProcessor.from_pretrained(\"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\")" + ] + }, + { + "cell_type": "markdown", + "id": "5568eb71-5812-4c74-989c-c12271d33b12", + "metadata": {}, + "source": [ + "### Model Inference with ORT-QNN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02bad4ec-f477-4659-8584-00735f6ed5a9", + "metadata": {}, + "outputs": [], + "source": [ + "import onnxruntime as ort\n", + "import torch\n", + "import numpy as np\n", + "\n", + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + "\n", + "\n", + "session_options = ort.SessionOptions()\n", + "\n", + "add_ep_for_device(session_options, ExecutionProvider, ort.OrtHardwareDeviceType.NPU)\n", + "\n", + "text_model = ort.InferenceSession(\n", + " onnx_model_path, # a model with QNN EPContext nodes\n", + " sess_options=session_options,\n", + ")\n", + "\n", + "def get_text_embedding(text):\n", + " inputs = processor(\n", + " text=text,\n", + " padding=\"max_length\",\n", + " max_length=77,#text_model.sequence_length,\n", + " truncation=True,\n", + " add_special_tokens=True,\n", + " return_tensors=\"np\",\n", + " )\n", + " output = text_model.run(None, {\n", + " \"input_ids\": inputs[\"input_ids\"].astype(np.int32),\n", + " \"attention_mask\": inputs[\"attention_mask\"].astype(np.int32),\n", + " })\n", + " return torch.from_numpy(output[0])\n", + "\n", + "def calculate_score(emb_1, emb_2):\n", + " emb_1 /= torch.norm(emb_1, dim=-1, keepdim=True)\n", + " emb_2 /= torch.norm(emb_2, dim=-1, keepdim=True)\n", + " return torch.matmul(emb_1, emb_2.T) * 100.0\n", + "\n", + "# Get source embedding and calculate the similarity score for each target\n", + "# We need to process the sentences one by one because, due to static quantization, the batch size is fixed to 1\n", + "def ask(source, targets):\n", + " source_emb = get_text_embedding(source)\n", + " scores = []\n", + " for i, target in enumerate(targets):\n", + " target_emb = get_text_embedding(target)\n", + " score = calculate_score(source_emb, target_emb)\n", + " print(f\"Similarity score of sentence {i}: {score.item()}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "3477e36c-2e72-432b-ae81-602073a3754c", + "metadata": {}, + "source": [ + "### Play with Samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8cdc2a6-4c81-4f93-8426-065ee4c2b013", + "metadata":
{}, + "outputs": [], + "source": [ + "ask(\"a photo containing two cats\", [\"a photo of tshirt\", \"a photo of two cats\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_trtrtx_inference_sample.ipynb b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_trtrtx_inference_sample.ipynb new file mode 100644 index 00000000..fc5e4a5f --- /dev/null +++ b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_trtrtx_inference_sample.ipynb @@ -0,0 +1,115 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "aeb33f1a", + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "ExecutionProvider=\"NvTensorRTRTXExecutionProvider\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22477669", + "metadata": {}, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "307fcca8", + "metadata": {}, + "outputs": [], + "source": [ + "from PIL import Image\n", + "import requests\n", + " \n", + "from transformers import CLIPProcessor\n", + "import onnxruntime as ort\n", + "import numpy as np\n", + "import torch\n", + " \n", + "processor = CLIPProcessor.from_pretrained(\"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\", use_fast=False)\n", + " \n", + "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", + "image = Image.open(requests.get(url, stream=True).raw)\n", + " \n", + "inputs = processor(text=[\"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\"],\n", + " images=image, return_tensors=\"np\", padding=\"max_length\",\n", + " max_length= 77, truncation=True)\n", + " \n", + "\n", + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + " \n", + "opts = ort.SessionOptions()\n", + " \n", + "add_ep_for_device(opts, ExecutionProvider, ort.OrtHardwareDeviceType.GPU)\n", + "assert opts.has_providers()\n", + "\n", + "# options = ort.SessionOptions()\n", + "session = 
ort.InferenceSession(onnx_model_path,\n", + " sess_options=opts,\n", + " # providers=[ExecutionProvider],\n", + " # provider_options=[provider_options]\n", + ")\n", + "logits_per_image = session.run([\"logits_per_image\"],\n", + " {\n", + " \"input_ids\": inputs['input_ids'].astype(np.int64),\n", + " \"attention_mask\": inputs['attention_mask'].astype(np.int64),\n", + " \"pixel_values\": inputs['pixel_values'].astype(np.float16)\n", + " })\n", + " \n", + "probs = torch.tensor(logits_per_image[0]).softmax(dim=1)\n", + "print(\"Label probs:\", probs)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "winml", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_vision_qnn_inference_sample.ipynb b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_vision_qnn_inference_sample.ipynb new file mode 100644 index 00000000..aa8a8757 --- /dev/null +++ b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_vision_qnn_inference_sample.ipynb @@ -0,0 +1,195 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "3c18a7d6", + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "\n", + "ExecutionProvider=\"QNNExecutionProvider\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f82e3bca", + "metadata": {}, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "markdown", + "id": "897ffb42-3569-4d78-b99d-355a38fdce35", + "metadata": {}, + "source": [ + "### Data Processor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa8d84cd-4853-4746-bce3-b281bfc23d8b", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import CLIPProcessor\n", + "\n", + "processor = CLIPProcessor.from_pretrained(\"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\")" + ] + }, + { + "cell_type": "markdown", + "id": "5568eb71-5812-4c74-989c-c12271d33b12", + "metadata": {}, + "source": [ + "### Model Inference with ORT-QNN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02bad4ec-f477-4659-8584-00735f6ed5a9", + "metadata": {}, + "outputs": [], + "source": [ + "import onnxruntime as ort\n", + "import torch\n", + "import numpy as np\n", + "\n", + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if 
ep_options is None else ep_options)\n", + " break\n", + "\n", + "\n", + "session_options = ort.SessionOptions()\n", + "\n", + "add_ep_for_device(session_options, ExecutionProvider, ort.OrtHardwareDeviceType.NPU)\n", + "\n", + "vision_model = ort.InferenceSession(\n", + " onnx_model_path, # a model with QNN EPContext nodes\n", + " sess_options=session_options,\n", + ")\n", + "\n", + "def get_image_embedding(image):\n", + " inputs = processor(images=image, return_tensors=\"np\")\n", + " output = vision_model.run(None, { \"pixel_values\": inputs[\"pixel_values\"] })\n", + " return torch.from_numpy(output[0])\n", + "\n", + "def calculate_score(emb_1, emb_2):\n", + " emb_1 /= torch.norm(emb_1, dim=-1, keepdim=True)\n", + " emb_2 /= torch.norm(emb_2, dim=-1, keepdim=True)\n", + " return torch.matmul(emb_1, emb_2.T) * 100.0\n", + "\n", + "# Get source embedding and calculate the similarity score for each target\n", + "# We need to process the images one by one because, due to static quantization, the batch size is fixed to 1\n", + "def ask(source, targets):\n", + " source_emb = get_image_embedding(source)\n", + " for i, target in enumerate(targets):\n", + " target_emb = get_image_embedding(target)\n", + " score = calculate_score(source_emb, target_emb)\n", + " print(f\"Similarity score of image {i}: {score.item()}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "3477e36c-2e72-432b-ae81-602073a3754c", + "metadata": {}, + "source": [ + "### Play with Samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16868fbd-e447-4866-af7d-eb6e49975bcc", + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from PIL import Image\n", + "\n", + "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", + "image = Image.open(requests.get(url, stream=True).raw)\n", + "image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07076b9a", + "metadata": {}, + "outputs": [], + "source": [ + "url = \"http://images.cocodataset.org/train2017/000000208833.jpg\"\n", + "image1 = Image.open(requests.get(url, stream=True).raw)\n", + "image1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c10de7cd", + "metadata": {}, + "outputs": [], + "source": [ + "url = \"http://images.cocodataset.org/train2017/000000125690.jpg\"\n", + "image2 = Image.open(requests.get(url, stream=True).raw)\n", + "image2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8cdc2a6-4c81-4f93-8426-065ee4c2b013", + "metadata": {}, + "outputs": [], + "source": [ + "ask(image, [image1, image2])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu.json b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu.json new file mode 100644 index 00000000..bb76bded --- /dev/null +++ b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu.json @@ -0,0 +1,181 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "laion/CLIP-ViT-B-32-laion2B-s34B-b79K", + "task": "zero-shot-image-classification", + "load_kwargs": { + "attn_implementation": "eager" + }, + "io_config": { + "input_names": [
+ "input_ids", + "pixel_values", + "attention_mask" + ], + "input_shapes": [ + [ + 10, + 77 + ], + [ + 1, + 3, + 224, + 224 + ], + [ + 10, + 77 + ] + ], + "input_types": [ + "int64", + "float32", + "int64" + ], + "output_names": [ + "logits_per_image", + "logits_per_text", + "text_embeds", + "image_embeds" + ], + "output_shapes": [ + [ + 1, + 10 + ], + [ + 10, + 1 + ], + [ + 10, + 512 + ], + [ + 1, + 512 + ] + ] + } + }, + "systems": { + "host_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "cpu", + "execution_providers": [ + "CPUExecutionProvider" + ] + } + ] + }, + "target_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "metric_data_config", + "user_script": "user_script.py", + "load_dataset_config": { + "type": "clip_dataset", + "model_name": "laion/CLIP-ViT-B-32-laion2B-s34B-b79K", + "dataset_name": "nlphuji/flickr30k", + "start": 0, + "end": 10 + }, + "dataloader_config": { + "type": "no_auto_batch_dataloader" + }, + "post_process_data_config": { + "type": "clip_post_process" + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "accuracy", + "type": "accuracy", + "backend": "huggingface_metrics", + "data_config": "metric_data_config", + "sub_types": [ + { + "name": "accuracy", + "priority": 1, + "goal": { + "type": "max-degradation", + "value": 0.05 + } + } + ] + }, + { + "name": "latency", + "type": "latency", + "data_config": "metric_data_config", + "sub_types": [ + { + "name": "avg", + "goal": { + "type": "percent-min-improvement", + "value": 0.1 + } + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + }, + { + "name": "throughput", + "type": "throughput", + "data_config": "metric_data_config", + "sub_types": [ + { + "name": "avg" + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + }, + "onnx_float_to_float16": { + "type": "OnnxFloatToFloat16", + "save_as_external_data": true + } + }, + "search_strategy": false, + "host": "host_system", + "target": "target_system", + "cache_dir": "cache", + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "output_dir": "model/clip" +} diff --git a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu.json.config b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu.json.config new file mode 100644 index 00000000..d17c25fa --- /dev/null +++ b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu.json.config @@ -0,0 +1,84 @@ +{ + "name": "Convert to WebGPU", + "addCpu": false, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.target_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [ + { + "name": "Evaluation Dataset", + "tags": [ + "EvaluationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.dataset_name", + "values": [ + 
"nlphuji/flickr30k" + ], + "template": { + "path": "data_configs[0].load_dataset_config.dataset_name", + "values": [ + "nlphuji/flickr30k" + ], + "template": "EvaluationDataset" + } + }, + { + "name": "Evaluation Dataset Size", + "type": "int", + "path": "data_configs[0].load_dataset_config.end", + "template": { + "path": "data_configs[0].load_dataset_config.end", + "template": "EvaluationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu_inference_sample.ipynb b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu_inference_sample.ipynb new file mode 100644 index 00000000..6938c9bb --- /dev/null +++ b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu_inference_sample.ipynb @@ -0,0 +1,115 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "aeb33f1a", + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "ExecutionProvider=\"WebGpuExecutionProvider\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22477669", + "metadata": {}, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "307fcca8", + "metadata": {}, + "outputs": [], + "source": [ + "from PIL import Image\n", + "import requests\n", + " \n", + "from transformers import CLIPProcessor\n", + "import onnxruntime as ort\n", + "import numpy as np\n", + "import torch\n", + " \n", + "processor = CLIPProcessor.from_pretrained(\"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\", use_fast=False)\n", + " \n", + "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", + "image = Image.open(requests.get(url, stream=True).raw)\n", + " \n", + "inputs = processor(text=[\"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\"],\n", + " images=image, return_tensors=\"np\", padding=\"max_length\",\n", + " max_length= 77, truncation=True)\n", + " \n", + "\n", + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + " \n", + "opts = ort.SessionOptions()\n", + " \n", + "add_ep_for_device(opts, ExecutionProvider, ort.OrtHardwareDeviceType.GPU)\n", + "assert opts.has_providers()\n", + "\n", + "# options = 
ort.SessionOptions()\n", + "session = ort.InferenceSession(onnx_model_path,\n", + " sess_options=opts,\n", + " # providers=[ExecutionProvider],\n", + " # provider_options=[provider_options]\n", + ")\n", + "logits_per_image = session.run([\"logits_per_image\"],\n", + " {\n", + " \"input_ids\": inputs['input_ids'].astype(np.int64),\n", + " \"attention_mask\": inputs['attention_mask'].astype(np.int64),\n", + " \"pixel_values\": inputs['pixel_values'].astype(np.float16)\n", + " })\n", + " \n", + "probs = torch.tensor(logits_per_image[0]).softmax(dim=1)\n", + "print(\"Label probs:\", probs)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "winml", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/model_project.config b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/model_project.config index f1670578..9f31e5be 100644 --- a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/model_project.config +++ b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/model_project.config @@ -20,6 +20,10 @@ "file": "laion_clip_dml.json", "templateName": "laion_clip_dml" }, + { + "file": "laion_clip_webgpu.json", + "templateName": "laion_clip_webgpu" + }, { "file": "laion_clip_migraphx.json", "templateName": "laion_clip_migraphx" diff --git a/meta-llama-Llama-3.1-8B-Instruct/aitk/README.md b/meta-llama-Llama-3.1-8B-Instruct/aitk/README.md index 612cdf2b..8efa8cc5 100644 --- a/meta-llama-Llama-3.1-8B-Instruct/aitk/README.md +++ b/meta-llama-Llama-3.1-8B-Instruct/aitk/README.md @@ -7,7 +7,7 @@ This repository demonstrates the optimization of the [Llama-3.1-8B-Instruct](htt + This process extends the QDQ flow and compiling specifically for **Qualcomm NPUs** - OpenVINO for Intel® CPU/GPU/NPU + This process uses OpenVINO specific passes like `OpenVINOOptimumConversion`, `OpenVINOIoUpdate` and `OpenVINOEncapsulation` -- Float downcasting for NVIDIA TRT for RTX GPU +- Float downcasting for NVIDIA TRT for RTX GPU / WebGPU for general GPU - DML for general GPU + This process uses AutoAWQ and ModelBuilder diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/README.md b/meta-llama-Llama-3.2-1B-Instruct/aitk/README.md index 573bf132..b8873094 100644 --- a/meta-llama-Llama-3.2-1B-Instruct/aitk/README.md +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/README.md @@ -7,7 +7,7 @@ This repository demonstrates the optimization of the [Llama-3.2-1B-Instruct](htt + This process extends the QDQ flow and compiling specifically for **Qualcomm NPUs** - OpenVINO for Intel® CPU/GPU/NPU + This process uses OpenVINO specific passes like `OpenVINOOptimumConversion`, `OpenVINOIoUpdate` and `OpenVINOEncapsulation` -- Float downcasting for NVIDIA TRT for RTX GPU +- Float downcasting for NVIDIA TRT for RTX GPU / WebGPU for general GPU - DML for general GPU + This process uses AutoAWQ and ModelBuilder diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/_copy.json.config b/meta-llama-Llama-3.2-1B-Instruct/aitk/_copy.json.config index c59e4aa4..c49d0c2f 100644 --- a/meta-llama-Llama-3.2-1B-Instruct/aitk/_copy.json.config +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/_copy.json.config @@ -61,6 +61,25 @@ "dst": "llama3_2_dml_config.json.config", "replacements": [] }, + { + "src": 
"../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu_config.json", + "dst": "llama3_2_webgpu_config.json", + "replacements": [ + { + "find": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "replace": "meta-llama/Llama-3.2-1B-Instruct" + }, + { + "find": "model/deepseek", + "replace": "model/llama3_2" + } + ] + }, + { + "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu_config.json.config", + "dst": "llama3_2_webgpu_config.json.config", + "replacements": [] + }, { "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md", "dst": "README.md", diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/info.yml b/meta-llama-Llama-3.2-1B-Instruct/aitk/info.yml index f159eae3..fb0a0e6c 100644 --- a/meta-llama-Llama-3.2-1B-Instruct/aitk/info.yml +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/info.yml @@ -34,6 +34,12 @@ recipes: - file: "llama3_2_dml_config.json" device: gpu ep: DmlExecutionProvider + - file: "llama3_2_webgpu_config.json" + device: gpu + ep: WebGpuExecutionProvider + - file: "llama3_2_migraphx_config.json" + device: gpu + ep: MIGraphXExecutionProvider aitk: modelInfo: id: "huggingface/meta-llama/Llama-3.2-1B-Instruct" diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu_config.json b/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu_config.json new file mode 100644 index 00000000..4a2e2a3f --- /dev/null +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu_config.json @@ -0,0 +1,38 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "meta-llama/Llama-3.2-1B-Instruct" + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ { "device": "gpu", "execution_providers": [ "WebGpuExecutionProvider" ] } ] + } + }, + "data_configs": [ + { + "name": "wikitext2_train", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "wikitext", + "subset": "wikitext-2-raw-v1", + "split": "train" + }, + "pre_process_data_config": { + "strategy": "line-by-line", + "add_special_tokens": false, + "max_samples": 128, + "max_seq_len": 512 + } + } + ], + "passes": { + "builder": { "type": "ModelBuilder", "precision": "int4" } + }, + "target": "local_system", + "log_severity_level": 1, + "output_dir": "model/llama3_2", + "cache_dir": "cache", + "no_artifacts": true, + "evaluate_input_model": false +} diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu_config.json.config b/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu_config.json.config new file mode 100644 index 00000000..b3e6c90c --- /dev/null +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu_config.json.config @@ -0,0 +1,43 @@ +{ + "name": "Convert to WebGPU", + "oliveFile": "", + "isLLM": true, + "debugInfo": { + "autoGenerated": true, + "useModelBuilder": "builder" + }, + "needHFLogin": true, + "addCpu": false, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.builder", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/model_project.config b/meta-llama-Llama-3.2-1B-Instruct/aitk/model_project.config index 
e0a1d7cb..e96ca946 100644 --- a/meta-llama-Llama-3.2-1B-Instruct/aitk/model_project.config +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/model_project.config @@ -23,6 +23,14 @@ { "file": "llama3_2_dml_config.json", "templateName": "llama3_2_dml_config" + }, + { + "file": "llama3_2_webgpu_config.json", + "templateName": "llama3_2_webgpu_config" + }, + { + "file": "llama3_2_migraphx_config.json", + "templateName": "llama3_2_migraphx_config" } ], "modelInfo": { diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/README.md b/microsoft-Phi-3.5-mini-instruct/aitk/README.md index cd635e33..c0bcb905 100644 --- a/microsoft-Phi-3.5-mini-instruct/aitk/README.md +++ b/microsoft-Phi-3.5-mini-instruct/aitk/README.md @@ -7,7 +7,7 @@ This repository demonstrates the optimization of the [Microsoft Phi-3.5 Mini Ins + This process extends the QDQ flow and compiling specifically for **Qualcomm NPUs** - OpenVINO for Intel® CPU/GPU/NPU + This process uses OpenVINO specific passes like `OpenVINOOptimumConversion`, `OpenVINOIoUpdate` and `OpenVINOEncapsulation` -- Float downcasting for NVIDIA TRT for RTX GPU +- Float downcasting for NVIDIA TRT for RTX GPU / WebGPU for general GPU - DML for general GPU + This process uses AutoAWQ and ModelBuilder diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/_copy.json.config b/microsoft-Phi-3.5-mini-instruct/aitk/_copy.json.config index 1142a71c..d02d1954 100644 --- a/microsoft-Phi-3.5-mini-instruct/aitk/_copy.json.config +++ b/microsoft-Phi-3.5-mini-instruct/aitk/_copy.json.config @@ -29,8 +29,8 @@ ] }, { - "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_dml_config.json", - "dst": "phi3_5_dml_config.json", + "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_ov_config.json", + "dst": "phi3_5_ov_config.json", "replacements": [ { "find": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", @@ -47,6 +47,25 @@ "dst": "phi3_5_dml_config.json.config", "replacements": [] }, + { + "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu_config.json", + "dst": "phi3_5_webgpu_config.json", + "replacements": [ + { + "find": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "replace": "microsoft/Phi-3.5-mini-instruct" + }, + { + "find": "model/deepseek", + "replace": "model/phi3_5" + } + ] + }, + { + "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu_config.json.config", + "dst": "phi3_5_webgpu_config.json.config", + "replacements": [] + }, { "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md", "dst": "README.md", diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/info.yml b/microsoft-Phi-3.5-mini-instruct/aitk/info.yml index 890fb728..5eab37db 100644 --- a/microsoft-Phi-3.5-mini-instruct/aitk/info.yml +++ b/microsoft-Phi-3.5-mini-instruct/aitk/info.yml @@ -34,6 +34,12 @@ recipes: - file: "phi3_5_dml_config.json" device: gpu ep: DmlExecutionProvider + - file: "phi3_5_webgpu_config.json" + device: gpu + ep: WebGpuExecutionProvider + - file: "phi3_5_migraphx_config.json" + device: gpu + ep: MIGraphXExecutionProvider aitk: modelInfo: id: "huggingface/microsoft/Phi-3.5-mini-instruct" diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/model_project.config b/microsoft-Phi-3.5-mini-instruct/aitk/model_project.config index 4b1a78b5..1444a95a 100644 --- a/microsoft-Phi-3.5-mini-instruct/aitk/model_project.config +++ b/microsoft-Phi-3.5-mini-instruct/aitk/model_project.config @@ -23,6 +23,14 @@ { "file": "phi3_5_dml_config.json", "templateName": "phi3_5_dml_config" + }, + { + "file": 
"phi3_5_webgpu_config.json", + "templateName": "phi3_5_webgpu_config" + }, + { + "file": "phi3_5_migraphx_config.json", + "templateName": "phi3_5_migraphx_config" } ], "modelInfo": { diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu_config.json b/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu_config.json new file mode 100644 index 00000000..3f1a4b36 --- /dev/null +++ b/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu_config.json @@ -0,0 +1,38 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "microsoft/Phi-3.5-mini-instruct" + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ { "device": "gpu", "execution_providers": [ "WebGpuExecutionProvider" ] } ] + } + }, + "data_configs": [ + { + "name": "wikitext2_train", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "wikitext", + "subset": "wikitext-2-raw-v1", + "split": "train" + }, + "pre_process_data_config": { + "strategy": "line-by-line", + "add_special_tokens": false, + "max_samples": 128, + "max_seq_len": 512 + } + } + ], + "passes": { + "builder": { "type": "ModelBuilder", "precision": "int4" } + }, + "target": "local_system", + "log_severity_level": 1, + "output_dir": "model/phi3_5", + "cache_dir": "cache", + "no_artifacts": true, + "evaluate_input_model": false +} diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu_config.json.config b/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu_config.json.config new file mode 100644 index 00000000..b3e6c90c --- /dev/null +++ b/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu_config.json.config @@ -0,0 +1,43 @@ +{ + "name": "Convert to WebGPU", + "oliveFile": "", + "isLLM": true, + "debugInfo": { + "autoGenerated": true, + "useModelBuilder": "builder" + }, + "needHFLogin": true, + "addCpu": false, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.builder", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/microsoft-resnet-50/aitk/README.md b/microsoft-resnet-50/aitk/README.md index 37dd1448..ffd2ebc1 100644 --- a/microsoft-resnet-50/aitk/README.md +++ b/microsoft-resnet-50/aitk/README.md @@ -4,7 +4,7 @@ This folder contains examples of ResNet optimization using different workflows. 
- QDQ for Qualcomm NPU / AMD NPU - OpenVINO for Intel® CPU/GPU/NPU -- Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU +- Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU / WebGPU for general GPU ## QDQ for Qualcomm NPU / AMD NPU diff --git a/microsoft-resnet-50/aitk/inference_sample.ipynb b/microsoft-resnet-50/aitk/inference_sample.ipynb index e2a97c45..e9202f6a 100644 --- a/microsoft-resnet-50/aitk/inference_sample.ipynb +++ b/microsoft-resnet-50/aitk/inference_sample.ipynb @@ -45,6 +45,29 @@ "register_execution_providers()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/microsoft-resnet-50/aitk/info.yml b/microsoft-resnet-50/aitk/info.yml index 4978c8ff..343fb825 100644 --- a/microsoft-resnet-50/aitk/info.yml +++ b/microsoft-resnet-50/aitk/info.yml @@ -23,6 +23,9 @@ recipes: - file: "resnet_dml.json" device: gpu ep: DmlExecutionProvider + - file: "resnet_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider aitk: modelInfo: id: "huggingface/microsoft/resnet-50" diff --git a/microsoft-resnet-50/aitk/model_project.config b/microsoft-resnet-50/aitk/model_project.config index 2bd3c778..6c5defda 100644 --- a/microsoft-resnet-50/aitk/model_project.config +++ b/microsoft-resnet-50/aitk/model_project.config @@ -23,6 +23,10 @@ { "file": "resnet_dml.json", "templateName": "resnet_dml" + }, + { + "file": "resnet_webgpu.json", + "templateName": "resnet_webgpu" } ], "modelInfo": { diff --git a/microsoft-resnet-50/aitk/resnet_dml_inference_sample.ipynb b/microsoft-resnet-50/aitk/resnet_dml_inference_sample.ipynb new file mode 100644 index 00000000..f1f634e1 --- /dev/null +++ b/microsoft-resnet-50/aitk/resnet_dml_inference_sample.ipynb @@ -0,0 +1,145 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "ExecutionProvider=\"DmlExecutionProvider\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from PIL 
import Image\n", + "url = \"https://onnxruntime.ai/images/dog.jpeg\"\n", + "response = requests.get(url)\n", + "# Save the image to a file\n", + "with open(\"dog.jpeg\", \"wb\") as file:\n", + " file.write(response.content)\n", + "img = Image.open(\"dog.jpeg\")\n", + "img" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import onnxruntime as ort\n", + "from PIL import Image\n", + "import torch\n", + "import torchvision.transforms as transforms\n", + "from torchvision.models.resnet import ResNet50_Weights\n", + "import numpy as np\n", + "\n", + "image_file_path = \"dog.jpeg\"\n", + "\n", + "# Create ONNX runtime session\n", + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + "\n", + "\n", + "session_options = ort.SessionOptions()\n", + "\n", + "add_ep_for_device(session_options, ExecutionProvider, ort.OrtHardwareDeviceType.GPU)\n", + "\n", + "session = ort.InferenceSession(\n", + " onnx_model_path, # the float16 ONNX model produced by the DirectML workflow\n", + " sess_options=session_options,\n", + ")\n", + "\n", + "print(\"Available providers:\", session.get_providers())\n", + "print(\"Current provider:\", session.get_provider_options())\n", + "\n", + "# Read and preprocess image\n", + "image = Image.open(image_file_path)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(image)\n", + "input_batch = input_tensor.unsqueeze(0)\n", + "\n", + "# Run inference\n", + "ort_inputs = {session.get_inputs()[0].name: input_batch.numpy().astype(np.float16)}\n", + "ort_outputs = session.run(None, ort_inputs)\n", + "\n", + "# Postprocess to get softmax vector\n", + "output = ort_outputs[0]\n", + "softmax = torch.nn.functional.softmax(torch.tensor(output), dim=1)\n", + "\n", + "# Extract top 10 predicted classes\n", + "top10 = torch.topk(softmax, 10)\n", + "\n", + "# Get label mapping\n", + "weights = ResNet50_Weights.DEFAULT\n", + "labels = weights.meta[\"categories\"]\n", + "\n", + "# Print results to console\n", + "print(\"Top 10 predictions for ResNet50 v2...\")\n", + "print(\"--------------------------------------------------------------\")\n", + "for i in range(10):\n", + " print(f\"Label: {labels[top10.indices[0][i]]}, Confidence: {top10.values[0][i].item():.4f}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cpu", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/microsoft-resnet-50/aitk/resnet_trtrtx_inference_sample.ipynb b/microsoft-resnet-50/aitk/resnet_trtrtx_inference_sample.ipynb new file mode 100644 index 00000000..a9bdf35d --- /dev/null +++ b/microsoft-resnet-50/aitk/resnet_trtrtx_inference_sample.ipynb @@ -0,0 +1,145 @@ +{ + "cells": [
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "ExecutionProvider=\"NvTensorRTRTXExecutionProvider\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from PIL import Image\n", + "url = \"https://onnxruntime.ai/images/dog.jpeg\"\n", + "response = requests.get(url)\n", + "# Save the image to a file\n", + "with open(\"dog.jpeg\", \"wb\") as file:\n", + " file.write(response.content)\n", + "img = Image.open(\"dog.jpeg\")\n", + "img" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import onnxruntime as ort\n", + "from PIL import Image\n", + "import torch\n", + "import torchvision.transforms as transforms\n", + "from torchvision.models.resnet import ResNet50_Weights\n", + "import numpy as np\n", + "\n", + "image_file_path = \"dog.jpeg\"\n", + "\n", + "# Create ONNX runtime session\n", + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + "\n", + "\n", + "session_options = ort.SessionOptions()\n", + "\n", + "add_ep_for_device(session_options, ExecutionProvider, ort.OrtHardwareDeviceType.GPU)\n", + "\n", + "session = ort.InferenceSession(\n", + " onnx_model_path, # the float16 ONNX model produced by the TensorRT for RTX workflow\n", + " sess_options=session_options,\n", + ")\n", + "\n", + "print(\"Available providers:\", session.get_providers())\n", + "print(\"Current provider:\", session.get_provider_options())\n", + "\n", + "# Read and preprocess image\n", + "image = Image.open(image_file_path)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(image)\n", + "input_batch = input_tensor.unsqueeze(0)\n", + "\n", + "# Run inference\n", + "ort_inputs = {session.get_inputs()[0].name: input_batch.numpy().astype(np.float16)}\n", + "ort_outputs = session.run(None, ort_inputs)\n", + "\n", + "# Postprocess to get softmax vector\n", + "output = ort_outputs[0]\n", + "softmax = torch.nn.functional.softmax(torch.tensor(output), dim=1)\n", + "\n", + "# Extract top 10 predicted classes\n", + "top10 = torch.topk(softmax, 10)\n", + "\n", + "# Get label mapping\n", + "weights =
ResNet50_Weights.DEFAULT\n", + "labels = weights.meta[\"categories\"]\n", + "\n", + "# Print results to console\n", + "print(\"Top 10 predictions for ResNet50 v2...\")\n", + "print(\"--------------------------------------------------------------\")\n", + "for i in range(10):\n", + " print(f\"Label: {labels[top10.indices[0][i]]}, Confidence: {top10.values[0][i].item():.4f}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cpu", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/microsoft-resnet-50/aitk/resnet_webgpu.json b/microsoft-resnet-50/aitk/resnet_webgpu.json new file mode 100644 index 00000000..e64119cf --- /dev/null +++ b/microsoft-resnet-50/aitk/resnet_webgpu.json @@ -0,0 +1,121 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "microsoft/resnet-50", + "task": "image-classification", + "io_config": { + "input_names": [ + "pixel_values" + ], + "input_shapes": [ + [ + 1, + 3, + 224, + 224 + ] + ], + "output_names": [ + "logits" + ] + } + }, + "systems": { + "host_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "cpu", + "execution_providers": [ + "CPUExecutionProvider" + ] + } + ] + }, + "target_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "evaluation_data_config", + "type": "HuggingfaceContainer", + "user_script": "imagenet.py", + "load_dataset_config": { + "data_name": "timm/mini-imagenet", + "split": "validation", + "streaming": true, + "trust_remote_code": true + }, + "pre_process_data_config": { + "type": "dataset_pre_process", + "size": 1000, + "cache_key": "imagedata_evaluation" + }, + "post_process_data_config": { + "type": "dataset_post_process" + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "accuracy", + "type": "accuracy", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "accuracy_score", + "priority": 1, + "metric_config": { + "task": "multiclass", + "num_classes": 1001 + } + } + ] + }, + { + "name": "latency", + "type": "latency", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg", + "priority": 2 + } + ] + } + ] + } + }, + "passes": { + "conversion": { + "device": "cpu", + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true, + "all_tensors_to_one_file": true, + "dynamic": false, + "use_dynamo_exporter": false + }, + "onnx_float_to_float16": { + "type": "OnnxFloatToFloat16", + "save_as_external_data": true + } + }, + "host": "host_system", + "target": "target_system", + "evaluator": "common_evaluator", + "cache_dir": "cache", + "output_dir": "model/resnet_webgpu", + "evaluate_input_model": false +} diff --git a/microsoft-resnet-50/aitk/resnet_webgpu.json.config b/microsoft-resnet-50/aitk/resnet_webgpu.json.config new file mode 100644 index 00000000..23c3f074 --- /dev/null +++ b/microsoft-resnet-50/aitk/resnet_webgpu.json.config @@ -0,0 +1,104 @@ +{ + "name": "Convert to WebGPU", + "addCpu": false, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": 
"systems.target_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [ + { + "name": "Evaluation Dataset", + "tags": [ + "EvaluationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "timm/mini-imagenet", + "imagenet-1k" + ], + "template": { + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "timm/mini-imagenet", + "imagenet-1k" + ], + "template": "EvaluationDataset" + } + }, + { + "name": "Evaluation Dataset Split", + "tags": [ + "EvaluationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[0].load_dataset_config.split", + "template": "EvaluationDatasetSplit" + } + }, + { + "name": "Evaluation Dataset Size", + "type": "int", + "path": "data_configs[0].pre_process_data_config.size", + "template": { + "path": "data_configs[0].pre_process_data_config.size", + "template": "EvaluationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/microsoft-resnet-50/aitk/resnet_webgpu_inference_sample.ipynb b/microsoft-resnet-50/aitk/resnet_webgpu_inference_sample.ipynb new file mode 100644 index 00000000..7cc8ad1c --- /dev/null +++ b/microsoft-resnet-50/aitk/resnet_webgpu_inference_sample.ipynb @@ -0,0 +1,145 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "ExecutionProvider=\"WebGpuExecutionProvider\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from PIL import Image\n", + "url = \"https://onnxruntime.ai/images/dog.jpeg\"\n", + "response = requests.get(url)\n", + "# Save the image to a file\n", + "with open(\"dog.jpeg\", \"wb\") as file:\n", + " file.write(response.content)\n", + "img = Image.open(\"dog.jpeg\")\n", + "img" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import onnxruntime as ort\n", + "from PIL import Image\n", + "import torch\n", + 
"import torchvision.transforms as transforms\n", + "from torchvision.models.resnet import ResNet50_Weights\n", + "import numpy as np\n", + "\n", + "image_file_path = \"dog.jpeg\"\n", + "\n", + "# Create ONNX runtime session\n", + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + "\n", + "\n", + "session_options = ort.SessionOptions()\n", + "\n", + "add_ep_for_device(session_options, ExecutionProvider, ort.OrtHardwareDeviceType.NPU)\n", + "\n", + "session = ort.InferenceSession(\n", + " onnx_model_path, # a model with QNN EPContext nodes\n", + " sess_options=session_options,\n", + ")\n", + "\n", + "print(\"Available providers:\", session.get_providers())\n", + "print(\"Current provider:\", session.get_provider_options())\n", + "\n", + "# Read and preprocess image\n", + "image = Image.open(image_file_path)\n", + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "input_tensor = preprocess(image)\n", + "input_batch = input_tensor.unsqueeze(0)\n", + "\n", + "# Run inference\n", + "ort_inputs = {session.get_inputs()[0].name: input_batch.numpy().astype(np.float16)}\n", + "ort_outputs = session.run(None, ort_inputs)\n", + "\n", + "# Postprocess to get softmax vector\n", + "output = ort_outputs[0]\n", + "softmax = torch.nn.functional.softmax(torch.tensor(output), dim=1)\n", + "\n", + "# Extract top 10 predicted classes\n", + "top10 = torch.topk(softmax, 10)\n", + "\n", + "# Get label mapping\n", + "weights = ResNet50_Weights.DEFAULT\n", + "labels = weights.meta[\"categories\"]\n", + "\n", + "# Print results to console\n", + "print(\"Top 10 predictions for ResNet50 v2...\")\n", + "print(\"--------------------------------------------------------------\")\n", + "for i in range(10):\n", + " print(f\"Label: {labels[top10.indices[0][i]]}, Confidence: {top10.values[0][i].item():.4f}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cpu", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/openai-clip-vit-base-patch16/aitk/README.md b/openai-clip-vit-base-patch16/aitk/README.md index 44468f65..45de6a35 100644 --- a/openai-clip-vit-base-patch16/aitk/README.md +++ b/openai-clip-vit-base-patch16/aitk/README.md @@ -4,7 +4,7 @@ This folder contains examples of Openai Clip optimization using different workfl - QDQ for Qualcomm NPU / AMD NPU - OpenVINO for Intel® CPU/GPU/NPU -- Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU +- Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU / WebGPU for general GPU ## Openai Clip optimization with QDQ for Qualcomm NPU / AMD NPU @@ -18,7 +18,7 @@ This workflow performs quantization with OpenVINO NNCF. 
It performs the optimiza - *HuggingFace Model -> OpenVINO Model -> Quantized OpenVINO model -> Quantized encapsulated ONNX OpenVINO IR model* -## Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU +## Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU / WebGPU for general GPU It performs the optimization pipeline: diff --git a/openai-clip-vit-base-patch16/aitk/inference_sample.ipynb b/openai-clip-vit-base-patch16/aitk/inference_sample.ipynb index cf26a73e..bc1a552f 100644 --- a/openai-clip-vit-base-patch16/aitk/inference_sample.ipynb +++ b/openai-clip-vit-base-patch16/aitk/inference_sample.ipynb @@ -53,6 +53,34 @@ "register_execution_providers()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf6bb9b8", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/openai-clip-vit-base-patch16/aitk/info.yml b/openai-clip-vit-base-patch16/aitk/info.yml index cffa85db..d1124ea7 100644 --- a/openai-clip-vit-base-patch16/aitk/info.yml +++ b/openai-clip-vit-base-patch16/aitk/info.yml @@ -20,6 +20,9 @@ recipes: - file: "openai_clip_dml.json" device: gpu ep: DmlExecutionProvider + - file: "openai_clip_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider - file: "openai_clip_migraphx.json" device: gpu ep: MIGraphXExecutionProvider diff --git a/openai-clip-vit-base-patch16/aitk/model_project.config b/openai-clip-vit-base-patch16/aitk/model_project.config index b2e2714b..b3c86e66 100644 --- a/openai-clip-vit-base-patch16/aitk/model_project.config +++ b/openai-clip-vit-base-patch16/aitk/model_project.config @@ -20,6 +20,10 @@ "file": "openai_clip_dml.json", "templateName": "openai_clip_dml" }, + { + "file": "openai_clip_webgpu.json", + "templateName": "openai_clip_webgpu" + }, { "file": "openai_clip_migraphx.json", "templateName": "openai_clip_migraphx" diff --git a/openai-clip-vit-base-patch16/aitk/openai_clip_dml_inference_sample.ipynb b/openai-clip-vit-base-patch16/aitk/openai_clip_dml_inference_sample.ipynb new file mode 100644 index 00000000..83c5e565 --- /dev/null +++ b/openai-clip-vit-base-patch16/aitk/openai_clip_dml_inference_sample.ipynb @@ -0,0 +1,115 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "aeb33f1a", + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "ExecutionProvider=\"DmlExecutionProvider\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22477669", + "metadata": {}, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " 
worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "307fcca8", + "metadata": {}, + "outputs": [], + "source": [ + "from PIL import Image\n", + "import requests\n", + " \n", + "from transformers import CLIPProcessor\n", + "import onnxruntime as ort\n", + "import numpy as np\n", + "import torch\n", + " \n", + "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch16\", use_fast=False)\n", + " \n", + "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", + "image = Image.open(requests.get(url, stream=True).raw)\n", + " \n", + "inputs = processor(text=[\"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\"],\n", + " images=image, return_tensors=\"np\", padding=\"max_length\",\n", + " max_length= 77, truncation=True)\n", + " \n", + "\n", + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + " \n", + "opts = ort.SessionOptions()\n", + " \n", + "add_ep_for_device(opts, ExecutionProvider, ort.OrtHardwareDeviceType.GPU)\n", + "assert opts.has_providers()\n", + "\n", + "# options = ort.SessionOptions()\n", + "session = ort.InferenceSession(onnx_model_path,\n", + " sess_options=opts,\n", + " # providers=[ExecutionProvider],\n", + " # provider_options=[provider_options]\n", + ")\n", + "logits_per_image = session.run([\"logits_per_image\"],\n", + " {\n", + " \"input_ids\": inputs['input_ids'].astype(np.int64),\n", + " \"attention_mask\": inputs['attention_mask'].astype(np.int64),\n", + " \"pixel_values\": inputs['pixel_values'].astype(np.float16)\n", + " })\n", + " \n", + "probs = torch.tensor(logits_per_image[0]).softmax(dim=1)\n", + "print(\"Label probs:\", probs)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "winml", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/openai-clip-vit-base-patch16/aitk/openai_clip_ov_inference_sample.ipynb b/openai-clip-vit-base-patch16/aitk/openai_clip_ov_inference_sample.ipynb new file mode 100644 index 00000000..9ba20f48 --- /dev/null +++ b/openai-clip-vit-base-patch16/aitk/openai_clip_ov_inference_sample.ipynb @@ -0,0 +1,113 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "aeb33f1a", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/openvino_model_quant_st.onnx\"\n", + "ExecutionProvider=\"OpenVINOExecutionProvider\"" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "bf6bb9b8", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "307fcca8", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "from PIL import Image\n", + "import requests\n", + " \n", + "from transformers import CLIPProcessor\n", + "import onnxruntime as ort\n", + "import numpy as np\n", + "import torch\n", + " \n", + "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch16\", use_fast=False)\n", + " \n", + "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", + "image = Image.open(requests.get(url, stream=True).raw)\n", + " \n", + "inputs = processor(text=[\"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\"],\n", + " images=image, return_tensors=\"np\", padding=\"max_length\",\n", + " max_length= 77, truncation=True)\n", + " \n", + "\n", + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + " \n", + "opts = ort.SessionOptions()\n", + " \n", + "add_ep_for_device(opts, ExecutionProvider, ort.OrtHardwareDeviceType.NPU)\n", + "assert opts.has_providers()\n", + "\n", + "# options = ort.SessionOptions()\n", + "session = ort.InferenceSession(onnx_model_path,\n", + " sess_options=opts,\n", + " # providers=[ExecutionProvider],\n", + " # provider_options=[provider_options]\n", + ")\n", + "logits_per_image = session.run([\"logits_per_image\"],\n", + " {\n", + " \"input_ids\": inputs['input_ids'].astype(np.int64),\n", + " \"attention_mask\": inputs['attention_mask'].astype(np.int64),\n", + " \"pixel_values\": inputs['pixel_values']\n", + " })\n", + " \n", + "probs = torch.tensor(logits_per_image[0]).softmax(dim=1)\n", + "print(\"Label probs:\", probs)" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/openai-clip-vit-base-patch16/aitk/openai_clip_text_qnn_inference_sample.ipynb b/openai-clip-vit-base-patch16/aitk/openai_clip_text_qnn_inference_sample.ipynb new file mode 100644 index 00000000..46a0e8d6 --- /dev/null +++ b/openai-clip-vit-base-patch16/aitk/openai_clip_text_qnn_inference_sample.ipynb @@ -0,0 +1,166 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "43751a72", + "metadata": {}, + "outputs": [], + "source": 
[ + "onnx_model_path = \"./model/model.onnx\"\n", + "ExecutionProvider=\"QNNExecutionProvider\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0ea54b2", + "metadata": {}, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "markdown", + "id": "897ffb42-3569-4d78-b99d-355a38fdce35", + "metadata": {}, + "source": [ + "### Data Processor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa8d84cd-4853-4746-bce3-b281bfc23d8b", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import CLIPProcessor\n", + "\n", + "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch16\")" + ] + }, + { + "cell_type": "markdown", + "id": "5568eb71-5812-4c74-989c-c12271d33b12", + "metadata": {}, + "source": [ + "### Model Inference with ORT-QNN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02bad4ec-f477-4659-8584-00735f6ed5a9", + "metadata": {}, + "outputs": [], + "source": [ + "import onnxruntime as ort\n", + "import torch\n", + "import numpy as np\n", + "\n", + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + "\n", + "\n", + "session_options = ort.SessionOptions()\n", + "\n", + "add_ep_for_device(session_options, ExecutionProvider, ort.OrtHardwareDeviceType.NPU)\n", + "\n", + "text_model = ort.InferenceSession(\n", + " onnx_model_path, # a model with QNN EPContext nodes\n", + " sess_options=session_options,\n", + ")\n", + "\n", + "def get_text_embedding(text):\n", + " inputs = processor(\n", + " text=text,\n", + " padding=\"max_length\",\n", + " max_length=77,#text_model.sequence_length,\n", + " truncation=True,\n", + " add_special_tokens=True,\n", + " return_tensors=\"np\",\n", + " )\n", + " output = text_model.run(None, {\n", + " \"input_ids\": inputs[\"input_ids\"].astype(np.int32),\n", + " \"attention_mask\": inputs[\"attention_mask\"].astype(np.int32),\n", + " })\n", + " return torch.from_numpy(output[0])\n", + "\n", + "def calculate_score(emb_1, emb_2):\n", + " emb_1 /= torch.norm(emb_1, dim=-1, keepdim=True)\n", + " emb_2 /= torch.norm(emb_2, dim=-1, keepdim=True)\n", + " return torch.matmul(emb_1, emb_2.T) * 100.0\n", + "\n", + "# Get source embedding and calculate the similarity score for each target\n", + "# We need to process one by one because to static quantization, we fixed the batch size to 1\n", + "def ask(source, targets):\n", + " source_emb = get_text_embedding(source)\n", + " scores = []\n", + " for i, target in enumerate(targets):\n", + " target_emb = get_text_embedding(target)\n", + " score = 
calculate_score(source_emb, target_emb)\n", + " print(f\"Similarity score of sentence {i}:{score.item()}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "3477e36c-2e72-432b-ae81-602073a3754c", + "metadata": {}, + "source": [ + "### Play with Samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8cdc2a6-4c81-4f93-8426-065ee4c2b013", + "metadata": {}, + "outputs": [], + "source": [ + "ask(\"a photo containing two cats\", [\"a photo of tshirt\", \"a photo of two cats\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/openai-clip-vit-base-patch16/aitk/openai_clip_trtrtx_inference_sample.ipynb b/openai-clip-vit-base-patch16/aitk/openai_clip_trtrtx_inference_sample.ipynb new file mode 100644 index 00000000..2343edf0 --- /dev/null +++ b/openai-clip-vit-base-patch16/aitk/openai_clip_trtrtx_inference_sample.ipynb @@ -0,0 +1,115 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "aeb33f1a", + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "ExecutionProvider=\"NvTensorRTRTXExecutionProvider\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22477669", + "metadata": {}, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "307fcca8", + "metadata": {}, + "outputs": [], + "source": [ + "from PIL import Image\n", + "import requests\n", + " \n", + "from transformers import CLIPProcessor\n", + "import onnxruntime as ort\n", + "import numpy as np\n", + "import torch\n", + " \n", + "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch16\", use_fast=False)\n", + " \n", + "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", + "image = Image.open(requests.get(url, stream=True).raw)\n", + " \n", + "inputs = processor(text=[\"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\"],\n", + " images=image, return_tensors=\"np\", padding=\"max_length\",\n", + " max_length= 77, truncation=True)\n", + " \n", + "\n", + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " 
session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + " \n", + "opts = ort.SessionOptions()\n", + " \n", + "add_ep_for_device(opts, ExecutionProvider, ort.OrtHardwareDeviceType.GPU)\n", + "assert opts.has_providers()\n", + "\n", + "# options = ort.SessionOptions()\n", + "session = ort.InferenceSession(onnx_model_path,\n", + " sess_options=opts,\n", + " # providers=[ExecutionProvider],\n", + " # provider_options=[provider_options]\n", + ")\n", + "logits_per_image = session.run([\"logits_per_image\"],\n", + " {\n", + " \"input_ids\": inputs['input_ids'].astype(np.int64),\n", + " \"attention_mask\": inputs['attention_mask'].astype(np.int64),\n", + " \"pixel_values\": inputs['pixel_values'].astype(np.float16)\n", + " })\n", + " \n", + "probs = torch.tensor(logits_per_image[0]).softmax(dim=1)\n", + "print(\"Label probs:\", probs)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "winml", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/openai-clip-vit-base-patch16/aitk/openai_clip_vision_qnn_inference_sample.ipynb b/openai-clip-vit-base-patch16/aitk/openai_clip_vision_qnn_inference_sample.ipynb new file mode 100644 index 00000000..f90ea43a --- /dev/null +++ b/openai-clip-vit-base-patch16/aitk/openai_clip_vision_qnn_inference_sample.ipynb @@ -0,0 +1,195 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "3c18a7d6", + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "\n", + "ExecutionProvider=\"QNNExecutionProvider\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f82e3bca", + "metadata": {}, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "markdown", + "id": "897ffb42-3569-4d78-b99d-355a38fdce35", + "metadata": {}, + "source": [ + "### Data Processor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa8d84cd-4853-4746-bce3-b281bfc23d8b", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import CLIPProcessor\n", + "\n", + "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch16\")" + ] + }, + { + "cell_type": "markdown", + "id": "5568eb71-5812-4c74-989c-c12271d33b12", + "metadata": {}, + "source": [ + "### Model Inference with ORT-QNN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02bad4ec-f477-4659-8584-00735f6ed5a9", + "metadata": {}, + "outputs": [], + "source": [ + "import onnxruntime as ort\n", + "import torch\n", + "import numpy as np\n", + "\n", + "def add_ep_for_device(session_options, ep_name, 
device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + "\n", + "\n", + "session_options = ort.SessionOptions()\n", + "\n", + "add_ep_for_device(session_options, ExecutionProvider, ort.OrtHardwareDeviceType.NPU)\n", + "\n", + "vision_model = ort.InferenceSession(\n", + " onnx_model_path, # a model with QNN EPContext nodes\n", + " sess_options=session_options,\n", + ")\n", + "\n", + "def get_image_embedding(image):\n", + " inputs = processor(images=image, return_tensors=\"np\")\n", + " output = vision_model.run(None, { \"pixel_values\": inputs[\"pixel_values\"] })\n", + " return torch.from_numpy(output[0])\n", + "\n", + "def calculate_score(emb_1, emb_2):\n", + " emb_1 /= torch.norm(emb_1, dim=-1, keepdim=True)\n", + " emb_2 /= torch.norm(emb_2, dim=-1, keepdim=True)\n", + " return torch.matmul(emb_1, emb_2.T) * 100.0\n", + "\n", + "# Get source embedding and calculate the similarity score for each target\n", + "# We need to process one by one because to static quantization, we fixed the batch size to 1\n", + "def ask(source, targets):\n", + " source_emb = get_image_embedding(source)\n", + " for i, target in enumerate(targets):\n", + " target_emb = get_image_embedding(target)\n", + " score = calculate_score(source_emb, target_emb)\n", + " print(f\"Similarity score of image {i}:{score.item()}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "3477e36c-2e72-432b-ae81-602073a3754c", + "metadata": {}, + "source": [ + "### Play with Samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16868fbd-e447-4866-af7d-eb6e49975bcc", + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from PIL import Image\n", + "\n", + "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", + "image = Image.open(requests.get(url, stream=True).raw)\n", + "image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07076b9a", + "metadata": {}, + "outputs": [], + "source": [ + "url = \"http://images.cocodataset.org/train2017/000000208833.jpg\"\n", + "image1 = Image.open(requests.get(url, stream=True).raw)\n", + "image1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c10de7cd", + "metadata": {}, + "outputs": [], + "source": [ + "url = \"http://images.cocodataset.org/train2017/000000125690.jpg\"\n", + "image2 = Image.open(requests.get(url, stream=True).raw)\n", + "image2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8cdc2a6-4c81-4f93-8426-065ee4c2b013", + "metadata": {}, + "outputs": [], + "source": [ + "ask(image, [image1, image2])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json b/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json new file mode 100644 index 00000000..39b77871 --- /dev/null +++ 
b/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json @@ -0,0 +1,181 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "openai/clip-vit-base-patch16", + "task": "zero-shot-image-classification", + "load_kwargs": { + "attn_implementation": "eager" + }, + "io_config": { + "input_names": [ + "input_ids", + "pixel_values", + "attention_mask" + ], + "input_shapes": [ + [ + 10, + 77 + ], + [ + 1, + 3, + 224, + 224 + ], + [ + 10, + 77 + ] + ], + "input_types": [ + "int64", + "float32", + "int64" + ], + "output_names": [ + "logits_per_image", + "logits_per_text", + "text_embeds", + "image_embeds" + ], + "output_shapes": [ + [ + 1, + 10 + ], + [ + 10, + 1 + ], + [ + 10, + 512 + ], + [ + 1, + 512 + ] + ] + } + }, + "systems": { + "host_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "cpu", + "execution_providers": [ + "CPUExecutionProvider" + ] + } + ] + }, + "target_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "metric_data_config", + "user_script": "user_script.py", + "load_dataset_config": { + "type": "clip_dataset", + "model_name": "openai/clip-vit-base-patch16", + "dataset_name": "nlphuji/flickr30k", + "start": 0, + "end": 10 + }, + "dataloader_config": { + "type": "no_auto_batch_dataloader" + }, + "post_process_data_config": { + "type": "clip_post_process" + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "accuracy", + "type": "accuracy", + "backend": "huggingface_metrics", + "data_config": "metric_data_config", + "sub_types": [ + { + "name": "accuracy", + "priority": 1, + "goal": { + "type": "max-degradation", + "value": 0.05 + } + } + ] + }, + { + "name": "latency", + "type": "latency", + "data_config": "metric_data_config", + "sub_types": [ + { + "name": "avg", + "goal": { + "type": "percent-min-improvement", + "value": 0.1 + } + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + }, + { + "name": "throughput", + "type": "throughput", + "data_config": "metric_data_config", + "sub_types": [ + { + "name": "avg" + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + }, + "onnx_float_to_float16": { + "type": "OnnxFloatToFloat16", + "save_as_external_data": true + } + }, + "search_strategy": false, + "host": "host_system", + "target": "target_system", + "cache_dir": "cache", + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "output_dir": "model/clip" +} diff --git a/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json.config b/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json.config new file mode 100644 index 00000000..d17c25fa --- /dev/null +++ b/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json.config @@ -0,0 +1,84 @@ +{ + "name": "Convert to WebGPU", + "addCpu": false, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.target_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + 
"readOnly": true + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [ + { + "name": "Evaluation Dataset", + "tags": [ + "EvaluationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.dataset_name", + "values": [ + "nlphuji/flickr30k" + ], + "template": { + "path": "data_configs[0].load_dataset_config.dataset_name", + "values": [ + "nlphuji/flickr30k" + ], + "template": "EvaluationDataset" + } + }, + { + "name": "Evaluation Dataset Size", + "type": "int", + "path": "data_configs[0].load_dataset_config.end", + "template": { + "path": "data_configs[0].load_dataset_config.end", + "template": "EvaluationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu_inference_sample.ipynb b/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu_inference_sample.ipynb new file mode 100644 index 00000000..9e2d7c1c --- /dev/null +++ b/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu_inference_sample.ipynb @@ -0,0 +1,115 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "aeb33f1a", + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "ExecutionProvider=\"WebGpuExecutionProvider\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22477669", + "metadata": {}, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "307fcca8", + "metadata": {}, + "outputs": [], + "source": [ + "from PIL import Image\n", + "import requests\n", + " \n", + "from transformers import CLIPProcessor\n", + "import onnxruntime as ort\n", + "import numpy as np\n", + "import torch\n", + " \n", + "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch16\", use_fast=False)\n", + " \n", + "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", + "image = Image.open(requests.get(url, stream=True).raw)\n", + " \n", + "inputs = processor(text=[\"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\"],\n", + " images=image, return_tensors=\"np\", padding=\"max_length\",\n", + " max_length= 77, truncation=True)\n", + " \n", + "\n", + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is 
None else ep_options)\n", + " break\n", + " \n", + "opts = ort.SessionOptions()\n", + " \n", + "add_ep_for_device(opts, ExecutionProvider, ort.OrtHardwareDeviceType.GPU)\n", + "assert opts.has_providers()\n", + "\n", + "# options = ort.SessionOptions()\n", + "session = ort.InferenceSession(onnx_model_path,\n", + " sess_options=opts,\n", + " # providers=[ExecutionProvider],\n", + " # provider_options=[provider_options]\n", + ")\n", + "logits_per_image = session.run([\"logits_per_image\"],\n", + " {\n", + " \"input_ids\": inputs['input_ids'].astype(np.int64),\n", + " \"attention_mask\": inputs['attention_mask'].astype(np.int64),\n", + " \"pixel_values\": inputs['pixel_values'].astype(np.float16)\n", + " })\n", + " \n", + "probs = torch.tensor(logits_per_image[0]).softmax(dim=1)\n", + "print(\"Label probs:\", probs)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "winml", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/openai-clip-vit-base-patch32/aitk/README.md b/openai-clip-vit-base-patch32/aitk/README.md index 44468f65..45de6a35 100644 --- a/openai-clip-vit-base-patch32/aitk/README.md +++ b/openai-clip-vit-base-patch32/aitk/README.md @@ -4,7 +4,7 @@ This folder contains examples of Openai Clip optimization using different workfl - QDQ for Qualcomm NPU / AMD NPU - OpenVINO for Intel® CPU/GPU/NPU -- Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU +- Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU / WebGPU for general GPU ## Openai Clip optimization with QDQ for Qualcomm NPU / AMD NPU @@ -18,7 +18,7 @@ This workflow performs quantization with OpenVINO NNCF. 
It performs the optimiza - *HuggingFace Model -> OpenVINO Model -> Quantized OpenVINO model -> Quantized encapsulated ONNX OpenVINO IR model* -## Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU +## Float downcasting for NVIDIA TRT for RTX GPU / DML for general GPU / WebGPU for general GPU It performs the optimization pipeline: diff --git a/openai-clip-vit-base-patch32/aitk/inference_sample.ipynb b/openai-clip-vit-base-patch32/aitk/inference_sample.ipynb index 86ccf8fe..325e9da9 100644 --- a/openai-clip-vit-base-patch32/aitk/inference_sample.ipynb +++ b/openai-clip-vit-base-patch32/aitk/inference_sample.ipynb @@ -53,6 +53,34 @@ "register_execution_providers()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf6bb9b8", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/openai-clip-vit-base-patch32/aitk/info.yml b/openai-clip-vit-base-patch32/aitk/info.yml index fb8fef2e..a76e4301 100644 --- a/openai-clip-vit-base-patch32/aitk/info.yml +++ b/openai-clip-vit-base-patch32/aitk/info.yml @@ -20,6 +20,9 @@ recipes: - file: "openai_clip_dml.json" device: gpu ep: DmlExecutionProvider + - file: "openai_clip_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider - file: "openai_clip_migraphx.json" device: gpu ep: MIGraphXExecutionProvider diff --git a/openai-clip-vit-base-patch32/aitk/model_project.config b/openai-clip-vit-base-patch32/aitk/model_project.config index 20bbc8f8..8ec217a7 100644 --- a/openai-clip-vit-base-patch32/aitk/model_project.config +++ b/openai-clip-vit-base-patch32/aitk/model_project.config @@ -20,6 +20,10 @@ "file": "openai_clip_dml.json", "templateName": "openai_clip_dml" }, + { + "file": "openai_clip_webgpu.json", + "templateName": "openai_clip_webgpu" + }, { "file": "openai_clip_migraphx.json", "templateName": "openai_clip_migraphx" diff --git a/openai-clip-vit-base-patch32/aitk/openai_clip_dml_inference_sample.ipynb b/openai-clip-vit-base-patch32/aitk/openai_clip_dml_inference_sample.ipynb new file mode 100644 index 00000000..7e2f5cc6 --- /dev/null +++ b/openai-clip-vit-base-patch32/aitk/openai_clip_dml_inference_sample.ipynb @@ -0,0 +1,115 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "aeb33f1a", + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "ExecutionProvider=\"DmlExecutionProvider\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22477669", + "metadata": {}, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " 
worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "307fcca8", + "metadata": {}, + "outputs": [], + "source": [ + "from PIL import Image\n", + "import requests\n", + " \n", + "from transformers import CLIPProcessor\n", + "import onnxruntime as ort\n", + "import numpy as np\n", + "import torch\n", + " \n", + "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch32\", use_fast=False)\n", + " \n", + "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", + "image = Image.open(requests.get(url, stream=True).raw)\n", + " \n", + "inputs = processor(text=[\"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\"],\n", + " images=image, return_tensors=\"np\", padding=\"max_length\",\n", + " max_length= 77, truncation=True)\n", + " \n", + "\n", + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + " \n", + "opts = ort.SessionOptions()\n", + " \n", + "add_ep_for_device(opts, ExecutionProvider, ort.OrtHardwareDeviceType.GPU)\n", + "assert opts.has_providers()\n", + "\n", + "# options = ort.SessionOptions()\n", + "session = ort.InferenceSession(onnx_model_path,\n", + " sess_options=opts,\n", + " # providers=[ExecutionProvider],\n", + " # provider_options=[provider_options]\n", + ")\n", + "logits_per_image = session.run([\"logits_per_image\"],\n", + " {\n", + " \"input_ids\": inputs['input_ids'].astype(np.int64),\n", + " \"attention_mask\": inputs['attention_mask'].astype(np.int64),\n", + " \"pixel_values\": inputs['pixel_values'].astype(np.float16)\n", + " })\n", + " \n", + "probs = torch.tensor(logits_per_image[0]).softmax(dim=1)\n", + "print(\"Label probs:\", probs)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "winml", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/openai-clip-vit-base-patch32/aitk/openai_clip_ov_inference_sample.ipynb b/openai-clip-vit-base-patch32/aitk/openai_clip_ov_inference_sample.ipynb new file mode 100644 index 00000000..03e0fc89 --- /dev/null +++ b/openai-clip-vit-base-patch32/aitk/openai_clip_ov_inference_sample.ipynb @@ -0,0 +1,113 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "aeb33f1a", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/openvino_model_quant_st.onnx\"\n", + "ExecutionProvider=\"OpenVINOExecutionProvider\"" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "bf6bb9b8", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "307fcca8", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "from PIL import Image\n", + "import requests\n", + " \n", + "from transformers import CLIPProcessor\n", + "import onnxruntime as ort\n", + "import numpy as np\n", + "import torch\n", + " \n", + "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch32\", use_fast=False)\n", + " \n", + "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", + "image = Image.open(requests.get(url, stream=True).raw)\n", + " \n", + "inputs = processor(text=[\"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\"],\n", + " images=image, return_tensors=\"np\", padding=\"max_length\",\n", + " max_length= 77, truncation=True)\n", + " \n", + "\n", + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + " \n", + "opts = ort.SessionOptions()\n", + " \n", + "add_ep_for_device(opts, ExecutionProvider, ort.OrtHardwareDeviceType.NPU)\n", + "assert opts.has_providers()\n", + "\n", + "# options = ort.SessionOptions()\n", + "session = ort.InferenceSession(onnx_model_path,\n", + " sess_options=opts,\n", + " # providers=[ExecutionProvider],\n", + " # provider_options=[provider_options]\n", + ")\n", + "logits_per_image = session.run([\"logits_per_image\"],\n", + " {\n", + " \"input_ids\": inputs['input_ids'].astype(np.int64),\n", + " \"attention_mask\": inputs['attention_mask'].astype(np.int64),\n", + " \"pixel_values\": inputs['pixel_values']\n", + " })\n", + " \n", + "probs = torch.tensor(logits_per_image[0]).softmax(dim=1)\n", + "print(\"Label probs:\", probs)" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/openai-clip-vit-base-patch32/aitk/openai_clip_text_qnn_inference_sample.ipynb b/openai-clip-vit-base-patch32/aitk/openai_clip_text_qnn_inference_sample.ipynb new file mode 100644 index 00000000..347c9d15 --- /dev/null +++ b/openai-clip-vit-base-patch32/aitk/openai_clip_text_qnn_inference_sample.ipynb @@ -0,0 +1,166 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "43751a72", + "metadata": {}, + "outputs": [], + "source": 
[ + "onnx_model_path = \"./model/model.onnx\"\n", + "ExecutionProvider=\"QNNExecutionProvider\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0ea54b2", + "metadata": {}, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "markdown", + "id": "897ffb42-3569-4d78-b99d-355a38fdce35", + "metadata": {}, + "source": [ + "### Data Processor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa8d84cd-4853-4746-bce3-b281bfc23d8b", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import CLIPProcessor\n", + "\n", + "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")" + ] + }, + { + "cell_type": "markdown", + "id": "5568eb71-5812-4c74-989c-c12271d33b12", + "metadata": {}, + "source": [ + "### Model Inference with ORT-QNN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02bad4ec-f477-4659-8584-00735f6ed5a9", + "metadata": {}, + "outputs": [], + "source": [ + "import onnxruntime as ort\n", + "import torch\n", + "import numpy as np\n", + "\n", + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + "\n", + "\n", + "session_options = ort.SessionOptions()\n", + "\n", + "add_ep_for_device(session_options, ExecutionProvider, ort.OrtHardwareDeviceType.NPU)\n", + "\n", + "text_model = ort.InferenceSession(\n", + " onnx_model_path, # a model with QNN EPContext nodes\n", + " sess_options=session_options,\n", + ")\n", + "\n", + "def get_text_embedding(text):\n", + " inputs = processor(\n", + " text=text,\n", + " padding=\"max_length\",\n", + " max_length=77,#text_model.sequence_length,\n", + " truncation=True,\n", + " add_special_tokens=True,\n", + " return_tensors=\"np\",\n", + " )\n", + " output = text_model.run(None, {\n", + " \"input_ids\": inputs[\"input_ids\"].astype(np.int32),\n", + " \"attention_mask\": inputs[\"attention_mask\"].astype(np.int32),\n", + " })\n", + " return torch.from_numpy(output[0])\n", + "\n", + "def calculate_score(emb_1, emb_2):\n", + " emb_1 /= torch.norm(emb_1, dim=-1, keepdim=True)\n", + " emb_2 /= torch.norm(emb_2, dim=-1, keepdim=True)\n", + " return torch.matmul(emb_1, emb_2.T) * 100.0\n", + "\n", + "# Get source embedding and calculate the similarity score for each target\n", + "# We need to process one by one because to static quantization, we fixed the batch size to 1\n", + "def ask(source, targets):\n", + " source_emb = get_text_embedding(source)\n", + " scores = []\n", + " for i, target in enumerate(targets):\n", + " target_emb = get_text_embedding(target)\n", + " score = 
calculate_score(source_emb, target_emb)\n", + " print(f\"Similarity score of sentence {i}:{score.item()}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "3477e36c-2e72-432b-ae81-602073a3754c", + "metadata": {}, + "source": [ + "### Play with Samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8cdc2a6-4c81-4f93-8426-065ee4c2b013", + "metadata": {}, + "outputs": [], + "source": [ + "ask(\"a photo containing two cats\", [\"a photo of tshirt\", \"a photo of two cats\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/openai-clip-vit-base-patch32/aitk/openai_clip_trtrtx_inference_sample.ipynb b/openai-clip-vit-base-patch32/aitk/openai_clip_trtrtx_inference_sample.ipynb new file mode 100644 index 00000000..4c1986a4 --- /dev/null +++ b/openai-clip-vit-base-patch32/aitk/openai_clip_trtrtx_inference_sample.ipynb @@ -0,0 +1,115 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "aeb33f1a", + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "ExecutionProvider=\"NvTensorRTRTXExecutionProvider\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22477669", + "metadata": {}, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "307fcca8", + "metadata": {}, + "outputs": [], + "source": [ + "from PIL import Image\n", + "import requests\n", + " \n", + "from transformers import CLIPProcessor\n", + "import onnxruntime as ort\n", + "import numpy as np\n", + "import torch\n", + " \n", + "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch32\", use_fast=False)\n", + " \n", + "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", + "image = Image.open(requests.get(url, stream=True).raw)\n", + " \n", + "inputs = processor(text=[\"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\"],\n", + " images=image, return_tensors=\"np\", padding=\"max_length\",\n", + " max_length= 77, truncation=True)\n", + " \n", + "\n", + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " 
session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + " \n", + "opts = ort.SessionOptions()\n", + " \n", + "add_ep_for_device(opts, ExecutionProvider, ort.OrtHardwareDeviceType.GPU)\n", + "assert opts.has_providers()\n", + "\n", + "# options = ort.SessionOptions()\n", + "session = ort.InferenceSession(onnx_model_path,\n", + " sess_options=opts,\n", + " # providers=[ExecutionProvider],\n", + " # provider_options=[provider_options]\n", + ")\n", + "logits_per_image = session.run([\"logits_per_image\"],\n", + " {\n", + " \"input_ids\": inputs['input_ids'].astype(np.int64),\n", + " \"attention_mask\": inputs['attention_mask'].astype(np.int64),\n", + " \"pixel_values\": inputs['pixel_values'].astype(np.float16)\n", + " })\n", + " \n", + "probs = torch.tensor(logits_per_image[0]).softmax(dim=1)\n", + "print(\"Label probs:\", probs)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "winml", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/openai-clip-vit-base-patch32/aitk/openai_clip_vision_qnn_inference_sample.ipynb b/openai-clip-vit-base-patch32/aitk/openai_clip_vision_qnn_inference_sample.ipynb new file mode 100644 index 00000000..0863f581 --- /dev/null +++ b/openai-clip-vit-base-patch32/aitk/openai_clip_vision_qnn_inference_sample.ipynb @@ -0,0 +1,195 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "3c18a7d6", + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "\n", + "ExecutionProvider=\"QNNExecutionProvider\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f82e3bca", + "metadata": {}, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "markdown", + "id": "897ffb42-3569-4d78-b99d-355a38fdce35", + "metadata": {}, + "source": [ + "### Data Processor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa8d84cd-4853-4746-bce3-b281bfc23d8b", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import CLIPProcessor\n", + "\n", + "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")" + ] + }, + { + "cell_type": "markdown", + "id": "5568eb71-5812-4c74-989c-c12271d33b12", + "metadata": {}, + "source": [ + "### Model Inference with ORT-QNN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02bad4ec-f477-4659-8584-00735f6ed5a9", + "metadata": {}, + "outputs": [], + "source": [ + "import onnxruntime as ort\n", + "import torch\n", + "import numpy as np\n", + "\n", + "def add_ep_for_device(session_options, ep_name, 
device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + " break\n", + "\n", + "\n", + "session_options = ort.SessionOptions()\n", + "\n", + "add_ep_for_device(session_options, ExecutionProvider, ort.OrtHardwareDeviceType.NPU)\n", + "\n", + "vision_model = ort.InferenceSession(\n", + " onnx_model_path, # a model with QNN EPContext nodes\n", + " sess_options=session_options,\n", + ")\n", + "\n", + "def get_image_embedding(image):\n", + " inputs = processor(images=image, return_tensors=\"np\")\n", + " output = vision_model.run(None, { \"pixel_values\": inputs[\"pixel_values\"] })\n", + " return torch.from_numpy(output[0])\n", + "\n", + "def calculate_score(emb_1, emb_2):\n", + " emb_1 /= torch.norm(emb_1, dim=-1, keepdim=True)\n", + " emb_2 /= torch.norm(emb_2, dim=-1, keepdim=True)\n", + " return torch.matmul(emb_1, emb_2.T) * 100.0\n", + "\n", + "# Get source embedding and calculate the similarity score for each target\n", + "# We need to process one target at a time because, due to static quantization, the batch size is fixed to 1\n", + "def ask(source, targets):\n", + " source_emb = get_image_embedding(source)\n", + " for i, target in enumerate(targets):\n", + " target_emb = get_image_embedding(target)\n", + " score = calculate_score(source_emb, target_emb)\n", + " print(f\"Similarity score of image {i}:{score.item()}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "3477e36c-2e72-432b-ae81-602073a3754c", + "metadata": {}, + "source": [ + "### Play with Samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16868fbd-e447-4866-af7d-eb6e49975bcc", + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from PIL import Image\n", + "\n", + "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", + "image = Image.open(requests.get(url, stream=True).raw)\n", + "image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07076b9a", + "metadata": {}, + "outputs": [], + "source": [ + "url = \"http://images.cocodataset.org/train2017/000000208833.jpg\"\n", + "image1 = Image.open(requests.get(url, stream=True).raw)\n", + "image1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c10de7cd", + "metadata": {}, + "outputs": [], + "source": [ + "url = \"http://images.cocodataset.org/train2017/000000125690.jpg\"\n", + "image2 = Image.open(requests.get(url, stream=True).raw)\n", + "image2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8cdc2a6-4c81-4f93-8426-065ee4c2b013", + "metadata": {}, + "outputs": [], + "source": [ + "ask(image, [image1, image2])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json b/openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json new file mode 100644 index 00000000..6198b710 --- /dev/null +++ 
b/openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json @@ -0,0 +1,181 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "openai/clip-vit-base-patch32", + "task": "zero-shot-image-classification", + "load_kwargs": { + "attn_implementation": "eager" + }, + "io_config": { + "input_names": [ + "input_ids", + "pixel_values", + "attention_mask" + ], + "input_shapes": [ + [ + 10, + 77 + ], + [ + 1, + 3, + 224, + 224 + ], + [ + 10, + 77 + ] + ], + "input_types": [ + "int64", + "float32", + "int64" + ], + "output_names": [ + "logits_per_image", + "logits_per_text", + "text_embeds", + "image_embeds" + ], + "output_shapes": [ + [ + 1, + 10 + ], + [ + 10, + 1 + ], + [ + 10, + 512 + ], + [ + 1, + 512 + ] + ] + } + }, + "systems": { + "host_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "cpu", + "execution_providers": [ + "CPUExecutionProvider" + ] + } + ] + }, + "target_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "metric_data_config", + "user_script": "user_script.py", + "load_dataset_config": { + "type": "clip_dataset", + "model_name": "openai/clip-vit-base-patch32", + "dataset_name": "nlphuji/flickr30k", + "start": 0, + "end": 10 + }, + "dataloader_config": { + "type": "no_auto_batch_dataloader" + }, + "post_process_data_config": { + "type": "clip_post_process" + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "accuracy", + "type": "accuracy", + "backend": "huggingface_metrics", + "data_config": "metric_data_config", + "sub_types": [ + { + "name": "accuracy", + "priority": 1, + "goal": { + "type": "max-degradation", + "value": 0.05 + } + } + ] + }, + { + "name": "latency", + "type": "latency", + "data_config": "metric_data_config", + "sub_types": [ + { + "name": "avg", + "goal": { + "type": "percent-min-improvement", + "value": 0.1 + } + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + }, + { + "name": "throughput", + "type": "throughput", + "data_config": "metric_data_config", + "sub_types": [ + { + "name": "avg" + }, + { + "name": "max" + }, + { + "name": "min" + } + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + }, + "onnx_float_to_float16": { + "type": "OnnxFloatToFloat16", + "save_as_external_data": true + } + }, + "search_strategy": false, + "host": "host_system", + "target": "target_system", + "cache_dir": "cache", + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "output_dir": "model/clip" +} diff --git a/openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json.config b/openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json.config new file mode 100644 index 00000000..d17c25fa --- /dev/null +++ b/openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json.config @@ -0,0 +1,84 @@ +{ + "name": "Convert to WebGPU", + "addCpu": false, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.target_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + 
"readOnly": true + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [ + { + "name": "Evaluation Dataset", + "tags": [ + "EvaluationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.dataset_name", + "values": [ + "nlphuji/flickr30k" + ], + "template": { + "path": "data_configs[0].load_dataset_config.dataset_name", + "values": [ + "nlphuji/flickr30k" + ], + "template": "EvaluationDataset" + } + }, + { + "name": "Evaluation Dataset Size", + "type": "int", + "path": "data_configs[0].load_dataset_config.end", + "template": { + "path": "data_configs[0].load_dataset_config.end", + "template": "EvaluationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/openai-clip-vit-base-patch32/aitk/openai_clip_webgpu_inference_sample.ipynb b/openai-clip-vit-base-patch32/aitk/openai_clip_webgpu_inference_sample.ipynb new file mode 100644 index 00000000..015cd8a1 --- /dev/null +++ b/openai-clip-vit-base-patch32/aitk/openai_clip_webgpu_inference_sample.ipynb @@ -0,0 +1,115 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "aeb33f1a", + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "ExecutionProvider=\"WebGpuExecutionProvider\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22477669", + "metadata": {}, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "307fcca8", + "metadata": {}, + "outputs": [], + "source": [ + "from PIL import Image\n", + "import requests\n", + " \n", + "from transformers import CLIPProcessor\n", + "import onnxruntime as ort\n", + "import numpy as np\n", + "import torch\n", + " \n", + "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch32\", use_fast=False)\n", + " \n", + "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", + "image = Image.open(requests.get(url, stream=True).raw)\n", + " \n", + "inputs = processor(text=[\"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\", \"a photo of a cat\", \"a photo of a dog\"],\n", + " images=image, return_tensors=\"np\", padding=\"max_length\",\n", + " max_length= 77, truncation=True)\n", + " \n", + "\n", + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " session_options.add_provider_for_devices([ep_device], {} if ep_options is 
None else ep_options)\n", + " break\n", + " \n", + "opts = ort.SessionOptions()\n", + " \n", + "add_ep_for_device(opts, ExecutionProvider, ort.OrtHardwareDeviceType.GPU)\n", + "assert opts.has_providers()\n", + "\n", + "# options = ort.SessionOptions()\n", + "session = ort.InferenceSession(onnx_model_path,\n", + " sess_options=opts,\n", + " # providers=[ExecutionProvider],\n", + " # provider_options=[provider_options]\n", + ")\n", + "logits_per_image = session.run([\"logits_per_image\"],\n", + " {\n", + " \"input_ids\": inputs['input_ids'].astype(np.int64),\n", + " \"attention_mask\": inputs['attention_mask'].astype(np.int64),\n", + " \"pixel_values\": inputs['pixel_values'].astype(np.float16)\n", + " })\n", + " \n", + "probs = torch.tensor(logits_per_image[0]).softmax(dim=1)\n", + "print(\"Label probs:\", probs)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "winml", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/openai-clip-vit-large-patch14/aitk/inference_sample.ipynb b/openai-clip-vit-large-patch14/aitk/inference_sample.ipynb index 96024cd4..f58feec6 100644 --- a/openai-clip-vit-large-patch14/aitk/inference_sample.ipynb +++ b/openai-clip-vit-large-patch14/aitk/inference_sample.ipynb @@ -53,6 +53,34 @@ "register_execution_providers()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf6bb9b8", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n", + "import subprocess\n", + "import json\n", + "import sys\n", + "import os\n", + "import onnxruntime as ort\n", + "\n", + "def register_execution_providers():\n", + " worker_script = os.path.abspath('winml.py')\n", + " result = subprocess.check_output([sys.executable, worker_script], text=True)\n", + " paths = json.loads(result)\n", + " for item in paths.items():\n", + " ort.register_execution_provider_library(item[0], item[1])\n", + "\n", + "register_execution_providers()" + ] + }, { "cell_type": "code", "execution_count": null,