diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh
index 3c173b0ea2a..b628af7b694 100755
--- a/.ci/scripts/export_model_artifact.sh
+++ b/.ci/scripts/export_model_artifact.sh
@@ -51,9 +51,10 @@ fi
 set -eux
 
 DEVICE="$1"
-HF_MODEL="$2"
-QUANT_NAME="${3:-non-quantized}"
-OUTPUT_DIR="${4:-.}"
+DTYPE="$2"
+HF_MODEL="$3"
+QUANT_NAME="${4:-non-quantized}"
+OUTPUT_DIR="${5:-.}"
 
 case "$DEVICE" in
   cuda)
@@ -67,6 +68,18 @@ case "$DEVICE" in
     ;;
 esac
 
+case "$DTYPE" in
+  float16)
+    ;;
+  bfloat16)
+    ;;
+  *)
+    echo "Error: Unsupported dtype '$DTYPE'"
+    echo "Supported dtypes: float16, bfloat16"
+    exit 1
+    ;;
+esac
+
 # Determine model configuration based on HF model ID
 case "$HF_MODEL" in
   mistralai/Voxtral-Mini-3B-2507)
@@ -155,7 +168,7 @@ optimum-cli export executorch \
   --model "$HF_MODEL" \
   --task "$TASK" \
   --recipe "$DEVICE" \
-  --dtype bfloat16 \
+  --dtype "$DTYPE" \
   ${DEVICE_ARG} \
   ${MAX_SEQ_LEN_ARG} \
   ${EXTRA_ARGS} \
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index c66d3621ea1..cd513de18a5 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -142,7 +142,7 @@ jobs:
         pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
         echo "::endgroup::"
 
-        source .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
+        source .ci/scripts/export_model_artifact.sh cuda bfloat16 "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
 
   test-model-cuda-e2e:
     name: test-model-cuda-e2e
diff --git a/.github/workflows/metal.yml b/.github/workflows/metal.yml
index 92351883e8f..01bb446432d 100644
--- a/.github/workflows/metal.yml
+++ b/.github/workflows/metal.yml
@@ -44,6 +44,9 @@ jobs:
             name: "whisper-small"
           - repo: "openai"
             name: "whisper-large-v3-turbo"
+        dtype:
+          - "float16"
+          - "bfloat16"
         quant:
           - "non-quantized"
     with:
@@ -53,7 +56,7 @@
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: 90
       secrets-env: EXECUTORCH_HF_TOKEN
-      upload-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-metal-${{ matrix.quant }}
+      upload-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-metal-${{ matrix.dtype }}-${{ matrix.quant }}
       script: |
         set -eux
 
@@ -76,7 +79,7 @@
         ${CONDA_RUN} pip list
         echo "::endgroup::"
 
-        ${CONDA_RUN} bash .ci/scripts/export_model_artifact.sh metal "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
+        ${CONDA_RUN} bash .ci/scripts/export_model_artifact.sh metal "${{ matrix.dtype }}" "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
 
   test-model-metal-e2e:
     name: test-model-metal-e2e
@@ -92,6 +95,9 @@
             name: "whisper-small"
           - repo: "openai"
             name: "whisper-large-v3-turbo"
+        dtype:
+          - "float16"
+          - "bfloat16"
         quant:
           - "non-quantized"
     with:
@@ -100,7 +106,7 @@
      submodules: 'recursive'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
-     download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-metal-${{ matrix.quant }}
+     download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-metal-${{ matrix.dtype }}-${{ matrix.quant }}
      script: |
        set -eux
 
diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp
index abfde86db6d..0661d327f20 100644
--- a/backends/aoti/common_shims.cpp
+++ b/backends/aoti/common_shims.cpp
@@ -170,6 +170,10 @@ int32_t aoti_torch_dtype_float32() {
   return 6; // PyTorch's float32 dtype code
 }
 
+int32_t aoti_torch_dtype_float16() {
+  return 5; // PyTorch's float16 dtype code
+}
+
 int32_t aoti_torch_dtype_bfloat16() {
   return 15; // PyTorch's bfloat16 dtype code
 }
diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h
index 675a9864e74..95825eadedd 100644
--- a/backends/aoti/common_shims.h
+++ b/backends/aoti/common_shims.h
@@ -63,6 +63,7 @@ aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim);
 AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cpu();
 AOTI_SHIM_EXPORT int32_t aoti_torch_layout_strided();
 AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_float32();
+AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_float16();
 AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_bfloat16();
 AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int8();
 AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int16();
diff --git a/backends/aoti/utils.h b/backends/aoti/utils.h
index 8f64bdbe7da..ddef8cfd94a 100644
--- a/backends/aoti/utils.h
+++ b/backends/aoti/utils.h
@@ -43,6 +43,8 @@ inline executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) {
       return executorch::aten::ScalarType::Int;
     case 4: // PyTorch's int64 dtype code
       return executorch::aten::ScalarType::Long;
+    case 5: // PyTorch's float16 dtype code
+      return executorch::aten::ScalarType::Half;
     case 6: // PyTorch's float32 dtype code
       return executorch::aten::ScalarType::Float;
     case 11: // PyTorch's bool dtype code
diff --git a/backends/apple/metal/runtime/shims/et_metal_ops.mm b/backends/apple/metal/runtime/shims/et_metal_ops.mm
index b150c68fe6d..a323cc7f225 100644
--- a/backends/apple/metal/runtime/shims/et_metal_ops.mm
+++ b/backends/apple/metal/runtime/shims/et_metal_ops.mm
@@ -224,12 +224,15 @@ AOTITorchError aoti_torch_mps_mm_out(
   MPSDataType mps_dtype;
   size_t element_size;
 
-  ET_LOG(Debug, "aoti_torch_mps_mm_out: self_tensor scalar_type=%d, SupportedDTypes::FLOAT32=%d, SupportedDTypes::BFLOAT16=%d",
-         dtype, static_cast<int32_t>(SupportedDTypes::FLOAT32), static_cast<int32_t>(SupportedDTypes::BFLOAT16));
+  ET_LOG(Debug, "aoti_torch_mps_mm_out: self_tensor scalar_type=%d, SupportedDTypes::FLOAT32=%d, SupportedDTypes::FLOAT16=%d, SupportedDTypes::BFLOAT16=%d",
+         dtype, static_cast<int32_t>(SupportedDTypes::FLOAT32), static_cast<int32_t>(SupportedDTypes::FLOAT16), static_cast<int32_t>(SupportedDTypes::BFLOAT16));
 
   if (dtype == static_cast<int32_t>(SupportedDTypes::FLOAT32)) {
     mps_dtype = MPSDataTypeFloat32;
     element_size = sizeof(float);
+  } else if (dtype == static_cast<int32_t>(SupportedDTypes::FLOAT16)) {
+    mps_dtype = MPSDataTypeFloat16;
+    element_size = sizeof(uint16_t); // half is 16 bits
   } else if (dtype == static_cast<int32_t>(SupportedDTypes::BFLOAT16)) {
     mps_dtype = MPSDataTypeBFloat16;
     element_size = sizeof(uint16_t); // bfloat16 is 16 bits
@@ -592,6 +595,9 @@ AOTITorchError aoti_torch_mps_convolution(
   if (dtype == static_cast<int32_t>(SupportedDTypes::FLOAT32)) {
     mps_dtype = MPSDataTypeFloat32;
     element_size = sizeof(float);
+  } else if (dtype == static_cast<int32_t>(SupportedDTypes::FLOAT16)) {
+    mps_dtype = MPSDataTypeFloat16;
+    element_size = sizeof(uint16_t); // half is 16 bits
   } else if (dtype == static_cast<int32_t>(SupportedDTypes::BFLOAT16)) {
     mps_dtype = MPSDataTypeBFloat16;
     element_size = sizeof(uint16_t); // bfloat16 is 16 bits
@@ -1084,6 +1090,9 @@ AOTITorchError aoti_torch_mps__scaled_dot_product_attention_math_for_mps(
   if (dtype == static_cast<int32_t>(SupportedDTypes::FLOAT32)) {
     mps_dtype = MPSDataTypeFloat32;
     element_size = sizeof(float);
+  } else if (dtype == static_cast<int32_t>(SupportedDTypes::FLOAT16)) {
+    mps_dtype = MPSDataTypeFloat16;
+    element_size = sizeof(uint16_t); // half is 16 bits
   } else if (dtype == static_cast<int32_t>(SupportedDTypes::BFLOAT16)) {
     mps_dtype = MPSDataTypeBFloat16;
     element_size = sizeof(uint16_t); // bfloat16 is 16 bits
diff --git a/backends/apple/metal/runtime/shims/utils.cpp b/backends/apple/metal/runtime/shims/utils.cpp
index 061360a4e28..b24aa49d7b8 100644
--- a/backends/apple/metal/runtime/shims/utils.cpp
+++ b/backends/apple/metal/runtime/shims/utils.cpp
@@ -21,6 +21,7 @@ bool is_dtype_supported_in_et_metal(int32_t dtype) {
   switch (dtype) {
     case static_cast<int32_t>(SupportedDTypes::INT64):
     case static_cast<int32_t>(SupportedDTypes::FLOAT32):
+    case static_cast<int32_t>(SupportedDTypes::FLOAT16):
    case static_cast<int32_t>(SupportedDTypes::BFLOAT16):
       return true;
     default:
@@ -40,6 +41,7 @@ AOTITorchError validate_dtype(int32_t dtype) {
         dtype,
         static_cast<int32_t>(SupportedDTypes::INT64),
         static_cast<int32_t>(SupportedDTypes::FLOAT32),
+        static_cast<int32_t>(SupportedDTypes::FLOAT16),
         static_cast<int32_t>(SupportedDTypes::BFLOAT16));
     return Error::InvalidArgument;
   }
diff --git a/backends/apple/metal/runtime/shims/utils.h b/backends/apple/metal/runtime/shims/utils.h
index 974832fa365..9c1b2449aaa 100644
--- a/backends/apple/metal/runtime/shims/utils.h
+++ b/backends/apple/metal/runtime/shims/utils.h
@@ -24,7 +24,7 @@ enum class SupportedDTypes : int32_t {
   // INT16 = 2, // PyTorch's int16 dtype code
   // INT32 = 3, // PyTorch's int32 dtype code
   INT64 = 4, // PyTorch's int64 dtype code
-  // FLOAT16 = 5, // PyTorch's float16 dtype code
+  FLOAT16 = 5, // PyTorch's float16 dtype code
   FLOAT32 = 6, // PyTorch's float32 dtype code
   // FLOAT64 = 7, // PyTorch's float64 dtype code
   // BOOL = 11, // PyTorch's bool dtype code
diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp
index 4f2523989c1..6848f272344 100644
--- a/extension/asr/runner/runner.cpp
+++ b/extension/asr/runner/runner.cpp
@@ -192,7 +192,24 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
           Info,
           "Conversion complete, first value = %f",
           static_cast<float>(
-              preprocessed_features->mutable_data_ptr<::executorch::aten::BFloat16>()[0]));
+              preprocessed_features
+                  ->mutable_data_ptr<::executorch::aten::BFloat16>()[0]));
+    } else if (expected_dtype == ::executorch::aten::ScalarType::Half) {
+      ET_LOG(
+          Info,
+          "Converting audio features from %s to Float16 (Half). Before converting, first value = %f",
+          ::executorch::runtime::toString(preprocessed_features->scalar_type()),
+          preprocessed_features->mutable_data_ptr<float>()[0]);
+      auto convert_result = ::executorch::extension::llm::convert_to_float16(
+          preprocessed_features);
+      ET_CHECK_OK_OR_RETURN_ERROR(convert_result.error());
+      preprocessed_features = convert_result.get();
+      ET_LOG(
+          Info,
+          "Conversion complete, first value = %f",
+          static_cast<float>(
+              preprocessed_features
+                  ->mutable_data_ptr<::executorch::aten::Half>()[0]));
     }
   }
 
diff --git a/extension/llm/runner/util.h b/extension/llm/runner/util.h
index e87d625f140..66de249b90f 100644
--- a/extension/llm/runner/util.h
+++ b/extension/llm/runner/util.h
@@ -174,6 +174,30 @@ convert_to_bfloat16(const ::executorch::extension::TensorPtr& src_tensor) {
   return bf16_tensor;
 }
 
+/**
+ * Helper function to convert a float tensor to float16 (Half).
+ * Creates a new tensor with Half dtype and copies/converts the data.
+ */
+inline ::executorch::runtime::Result<::executorch::extension::TensorPtr>
+convert_to_float16(const ::executorch::extension::TensorPtr& src_tensor) {
+  ET_CHECK_OR_RETURN_ERROR(
+      src_tensor->scalar_type() == ::executorch::aten::ScalarType::Float,
+      InvalidArgument,
+      "Float16 conversion only supported from Float source data");
+
+  const auto num_elements = static_cast<size_t>(src_tensor->numel());
+  const float* float_data = src_tensor->const_data_ptr<float>();
+
+  auto f16_tensor = ::executorch::extension::empty_like(
+      src_tensor, ::executorch::aten::ScalarType::Half);
+  auto* f16_data = f16_tensor->mutable_data_ptr<::executorch::aten::Half>();
+  for (size_t i = 0; i < num_elements; ++i) {
+    f16_data[i] = ::executorch::aten::Half(float_data[i]);
+  }
+
+  return f16_tensor;
+}
+
 } // namespace llm
 } // namespace extension
 } // namespace executorch
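For reference, the updated call order for export_model_artifact.sh is <device> <dtype> <hf_model> [<quant>] [<output_dir>], with dtype now the second positional argument, matching the updated cuda.yml and metal.yml invocations above. A minimal local sketch of a Metal float16 export (the output directory and quant value are illustrative; optimum-cli and the ExecuTorch Python deps are assumed to be installed as in the workflows):

  # <device> <dtype> <hf_model> [<quant>] [<output_dir>]
  bash .ci/scripts/export_model_artifact.sh \
    metal \
    float16 \
    openai/whisper-small \
    non-quantized \
    ./artifacts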