
Commit b5d9a41

Merge branch 'main' into sync-pt-commit
2 parents: 56fb60e + 45336ce

Note: large commits have some content hidden by default, so not every changed file is shown here.

47 files changed (+3879 / -1260 lines)

.github/workflows/lint.yml

Lines changed: 22 additions & 13 deletions
@@ -143,19 +143,28 @@ jobs:
           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
           timeout: 90
           script: |
-            FILES_NEEDS_FORMAT=$(/opt/google-java-format -n \
-              extension/android/executorch_android/src/main/java/org/pytorch/executorch/*.java \
-              extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/*.java \
-              extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations/*.java \
-              extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/*.java \
-              extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/*.java \
-              extension/benchmark/android/benchmark/app/src/androidTest/java/org/pytorch/minibench/*.java)
+            FILES_NEEDS_FORMAT=$(find extension/android/executorch_android/src/main/java/org/pytorch/executorch \
+              extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm \
+              extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations \
+              extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch \
+              extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench \
+              extension/benchmark/android/benchmark/app/src/androidTest/java/org/pytorch/minibench \
+              -type f -name "*.java" 2>/dev/null | \
+              xargs -r /opt/google-java-format -n)
+
             if [ -n "$FILES_NEEDS_FORMAT" ]; then
-              echo "Warning: The following files need formatting. Please use google-java-format."
-              echo "Use a binary from https://github.com/google/google-java-format/releases/"
-              echo "For example:"
-              echo "wget https://github.com/google/google-java-format/releases/download/v1.23.0/google-java-format_linux-x86-64"
-              echo "chmod +x google-java-format_linux-x86-64"
-              echo "./google-java-format_linux-x86-64 -i $FILES_NEEDS_FORMAT"
+              echo "Warning: The following files need formatting:"
+              echo "$FILES_NEEDS_FORMAT"
+              echo ""
+              echo "Please use google-java-format from https://github.com/google/google-java-format/releases/"
+              echo ""
+              echo "To fix, run one of these commands:"
+              echo "  # Using xargs (recommended):"
+              echo "  find <paths> -type f -name '*.java' | xargs google-java-format -i"
+              echo ""
+              echo "  # Or format specific files:"
+              echo "$FILES_NEEDS_FORMAT" | while IFS= read -r file; do
+                echo "  google-java-format -i \"$file\""
+              done
               exit 1
             fi

backends/aoti/common_shims.cpp

Lines changed: 12 additions & 0 deletions
@@ -172,6 +172,18 @@ int32_t aoti_torch_dtype_bfloat16() {
   return 15; // PyTorch's bfloat16 dtype code
 }
 
+int32_t aoti_torch_dtype_int8() {
+  return 1; // PyTorch's int8 dtype code
+}
+
+int32_t aoti_torch_dtype_int16() {
+  return 2; // PyTorch's int16 dtype code
+}
+
+int32_t aoti_torch_dtype_int32() {
+  return 3; // PyTorch's int32 dtype code
+}
+
 int32_t aoti_torch_dtype_int64() {
   return 4; // PyTorch's int64 dtype code
 }

backends/aoti/common_shims.h

Lines changed: 3 additions & 0 deletions
@@ -59,6 +59,9 @@ int32_t aoti_torch_device_type_cpu();
 int32_t aoti_torch_layout_strided();
 int32_t aoti_torch_dtype_float32();
 int32_t aoti_torch_dtype_bfloat16();
+int32_t aoti_torch_dtype_int8();
+int32_t aoti_torch_dtype_int16();
+int32_t aoti_torch_dtype_int32();
 int32_t aoti_torch_dtype_int64();
 
 // Dtype utility function needed by Metal backend

backends/aoti/utils.h

Lines changed: 6 additions & 0 deletions
@@ -34,6 +34,12 @@ inline executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) {
   // Convert based on known PyTorch dtype codes (without CUDA-specific
   // dependency)
   switch (dtype) {
+    case 1: // PyTorch's int8 dtype code
+      return executorch::aten::ScalarType::Char;
+    case 2: // PyTorch's int16 dtype code
+      return executorch::aten::ScalarType::Short;
+    case 3: // PyTorch's int32 dtype code
+      return executorch::aten::ScalarType::Int;
     case 4: // PyTorch's int64 dtype code
       return executorch::aten::ScalarType::Long;
     case 6: // PyTorch's float32 dtype code
backends/arm/test/misc/test_debug_feats.py

Lines changed: 5 additions & 2 deletions
@@ -21,6 +21,7 @@
     TosaPipelineFP,
     TosaPipelineINT,
 )
+from executorch.backends.test.harness.stages import StageType
 
 
 input_t1 = Tuple[torch.Tensor]  # Input x
@@ -104,7 +105,7 @@ def test_INT_artifact(test_data: input_t1):
 
 @common.parametrize("test_data", Linear.inputs)
 def test_numerical_diff_print(test_data: input_t1):
-    pipeline = TosaPipelineFP[input_t1](
+    pipeline = TosaPipelineINT[input_t1](
         Linear(),
         test_data,
         [],
@@ -119,7 +120,9 @@ def test_numerical_diff_print(test_data: input_t1):
     # not present.
     try:
         # Tolerate 0 difference => we want to trigger a numerical diff
-        tester.run_method_and_compare_outputs(atol=0, rtol=0, qtol=0)
+        tester.run_method_and_compare_outputs(
+            stage=StageType.INITIAL_MODEL, atol=0, rtol=0, qtol=0
+        )
     except AssertionError:
         pass  # Implicit pass test
     else:

backends/cuda/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ find_package_torch()
 set(_aoti_cuda_sources
     runtime/cuda_backend.cpp runtime/shims/memory.cpp
     runtime/shims/tensor_attribute.cpp runtime/guard.cpp
-    runtime/shims/cuda_guard.cpp
+    runtime/shims/cuda_guard.cpp runtime/shims/int4mm.cu
 )
 add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
 target_include_directories(

backends/cuda/cuda_backend.py

Lines changed: 3 additions & 1 deletion
@@ -33,7 +33,9 @@
 }
 
 # exist fallback operators in et namespace;
-supported_fallback_kernels: Dict[str, Any] = {}
+supported_fallback_kernels: Dict[str, Any] = {
+    "at::_ops::_weight_int4pack_mm::call": None,
+}
 
 # required fallback kernels but not supported
 missing_fallback_kernels: Set[str] = set()

backends/cuda/runtime/TARGETS

Lines changed: 8 additions & 0 deletions
@@ -1,4 +1,5 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+load("//tools/build/buck:nvcc_flags.bzl", "get_nvcc_arch_args")
 
 oncall("executorch")
 
@@ -7,12 +8,15 @@ runtime.cxx_library(
     srcs = [
         "guard.cpp",
         "shims/cuda_guard.cpp",
+        "shims/int4mm.cu",
         "shims/memory.cpp",
         "shims/tensor_attribute.cpp",
     ],
     headers = [
         "guard.h",
         "shims/cuda_guard.h",
+        "shims/int4mm.cuh",
+        "shims/int4mm.h",
         "shims/memory.h",
         "shims/tensor_attribute.h",
         "utils.h",
@@ -30,6 +34,10 @@ runtime.cxx_library(
         "//executorch/runtime/core/exec_aten:lib",
         "//executorch/runtime/platform:platform",
     ],
+    nvcc_flags = get_nvcc_arch_args() + [
+        "-_NVCC_HOST_COMPILER_FLAG_",
+        "gcc",
+    ],
     external_deps = [
         ("cuda", None, "cuda-lazy"),
     ],
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <executorch/backends/aoti/utils.h>
+#include <executorch/backends/cuda/runtime/shims/int4mm.h>
+#include <executorch/backends/cuda/runtime/shims/int4mm.cuh>
+#include <executorch/runtime/platform/log.h>
+
+namespace executorch::backends::cuda {
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+AOTITorchError aoti_torch_cuda__weight_int4pack_mm(
+    Tensor* self,
+    Tensor* mat2,
+    int64_t qGroupSize,
+    Tensor* qScaleAndZeros,
+    Tensor** ret0) {
+  // Validate input parameters first
+  // Only check for null pointers here, as the actual validation of tensor
+  // properties is done in _weight_int4pack_mm_cuda
+  ET_CHECK_OR_RETURN_ERROR(
+      self != nullptr,
+      InvalidArgument,
+      "aoti_torch_cuda__weight_int4pack_mm failed: self tensor is null");
+
+  ET_CHECK_OR_RETURN_ERROR(
+      mat2 != nullptr,
+      InvalidArgument,
+      "aoti_torch_cuda__weight_int4pack_mm failed: mat2 tensor is null");
+
+  ET_CHECK_OR_RETURN_ERROR(
+      qScaleAndZeros != nullptr,
+      InvalidArgument,
+      "aoti_torch_cuda__weight_int4pack_mm failed: qScaleAndZeros tensor is null");
+
+  ET_CHECK_OR_RETURN_ERROR(
+      ret0 != nullptr,
+      InvalidArgument,
+      "aoti_torch_cuda__weight_int4pack_mm failed: ret0 is null");
+
+  *ret0 = _weight_int4pack_mm_cuda(*self, *mat2, qGroupSize, *qScaleAndZeros);
+  ET_CUDA_KERNEL_LAUNCH_CHECK_OR_RETURN_ERROR();
+  return Error::Ok;
+}
+
+#ifdef __cplusplus
+}
+#endif
+} // namespace executorch::backends::cuda
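
For orientation, a hypothetical caller of this shim might look like the sketch below. This is an editor-added illustration, not part of the commit: the wrapper name and the source of the tensor pointers are assumptions, while Tensor and AOTITorchError are expected to come from int4mm.h as in the file above.

// Editor-added sketch: a hypothetical wrapper that forwards to the new shim.
// The shim itself performs the null checks and kernel-launch check shown
// above, so the caller only propagates the returned error.
#include <executorch/backends/cuda/runtime/shims/int4mm.h>

namespace executorch::backends::cuda {

AOTITorchError call_int4pack_mm_example(
    Tensor* activations,         // "self": input matrix
    Tensor* packed_weights,      // "mat2": int4-packed weight tensor
    int64_t q_group_size,        // quantization group size
    Tensor* q_scales_and_zeros,  // per-group scales and zero points
    Tensor** out) {              // receives the result tensor
  return aoti_torch_cuda__weight_int4pack_mm(
      activations, packed_weights, q_group_size, q_scales_and_zeros, out);
}

} // namespace executorch::backends::cuda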
