Skip to content

Commit f533532

Browse files
authored
Merge branch 'release/0.7' into cherry-pick-12452-by-pytorch_bot_bot_
2 parents bd2e706 + 2d79be5 commit f533532

File tree

4 files changed

+98
-11
lines changed

4 files changed

+98
-11
lines changed
Submodule XNNPACK updated 7178 files

backends/xnnpack/third-party/xnnpack.buck.bzl

Lines changed: 86 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,38 @@ def define_xnnpack():
274274
],
275275
)
276276

277+
SSE2_FMA_COMPILER_FLAGS = [
278+
"-msse2",
279+
"-mno-sse3",
280+
]
281+
282+
native.cxx_library(
283+
name = "ukernels_sse2fma",
284+
srcs = select({
285+
"DEFAULT": prod_srcs_for_arch_wrapper("sse2fma"),
286+
"ovr_config//cpu:arm32": DEFAULT_DUMMY_SRC,
287+
"ovr_config//cpu:arm64": DEFAULT_DUMMY_SRC,
288+
}),
289+
headers = get_xnnpack_headers(),
290+
header_namespace = "",
291+
compiler_flags = [
292+
"-O2",
293+
"-Wno-error=missing-braces", # required since the SGX toolchain does not have this by default
294+
] + select({
295+
"DEFAULT": SSE2_FMA_COMPILER_FLAGS,
296+
"ovr_config//cpu:arm32": [],
297+
"ovr_config//cpu:arm64": [],
298+
}),
299+
preferred_linkage = "static",
300+
preprocessor_flags = [
301+
"-DXNN_LOG_LEVEL=0",
302+
],
303+
exported_deps = [
304+
":FP16",
305+
":interface",
306+
],
307+
)
308+
277309
SSE3_COMPILER_FLAGS = ["-mssse3"]
278310

279311
# @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode.
@@ -961,6 +993,44 @@ def define_xnnpack():
961993
],
962994
)
963995

996+
AMD64_COMPILER_FLAGS = [
997+
"-mf16c",
998+
"-mfma",
999+
"-mavx512f",
1000+
"-mavx512cd",
1001+
"-mavx512bw",
1002+
"-mavx512dq",
1003+
"-mavx512vl",
1004+
"-mavx512vnni",
1005+
"-mgfni",
1006+
]
1007+
native.cxx_library(
1008+
name = "ukernels_amd64",
1009+
srcs = select({
1010+
"DEFAULT": prod_srcs_for_arch_wrapper("amd64"),
1011+
"ovr_config//cpu:arm32": DEFAULT_DUMMY_SRC,
1012+
"ovr_config//cpu:arm64": DEFAULT_DUMMY_SRC,
1013+
}),
1014+
headers = get_xnnpack_headers(),
1015+
header_namespace = "",
1016+
compiler_flags = [
1017+
"-O2",
1018+
"-Wno-error=missing-braces", # required since the SGX toolchain does not have this by default
1019+
] + select({
1020+
"DEFAULT": AMD64_COMPILER_FLAGS,
1021+
"ovr_config//cpu:arm32": [],
1022+
"ovr_config//cpu:arm64": [],
1023+
}),
1024+
preferred_linkage = "static",
1025+
preprocessor_flags = [
1026+
"-DXNN_LOG_LEVEL=0",
1027+
],
1028+
exported_deps = [
1029+
":FP16",
1030+
":interface",
1031+
],
1032+
)
1033+
9641034
AVX512VNNIGFNI_COMPILER_FLAGS = AVX512VNNI_COMPILER_FLAGS + [
9651035
"-mgfni",
9661036
]
@@ -1044,12 +1114,14 @@ def define_xnnpack():
10441114
":ukernels_fma3",
10451115
":ukernels_sse",
10461116
":ukernels_sse2",
1117+
":ukernels_sse2fma",
10471118
":ukernels_sse41",
10481119
":ukernels_ssse3",
10491120
":ukernels_avx512vbmi",
10501121
":ukernels_avx512vnnigfni",
10511122
":ukernels_avx512vnni",
10521123
":ukernels_avxvnni",
1124+
":ukernels_amd64",
10531125
]
10541126

10551127
ARM_XNNPACK_DEPS = [
@@ -1097,10 +1169,22 @@ def define_xnnpack():
10971169
"-DXNN_ENABLE_GEMM_M_SPECIALIZATION",
10981170
"-DXNN_ENABLE_ARM_DOTPROD",
10991171
"-DXNN_ENABLE_CPUINFO",
1100-
# "-DXNN_ENABLE_DWCONV_MULTIPLASS=1",
1172+
# "-DXNN_ENABLE_DWCONV_MULTIPLASS=0",
11011173
"-DXNN_ENABLE_ARM_I8MM=1",
11021174
"-DXNN_ENABLE_ARM_FP16_VECTOR=1",
1103-
"-DXNN_ENABLE_AVX512BF16=0"
1175+
"-DXNN_ENABLE_AVX512F=1",
1176+
"-DXNN_ENABLE_AVX512SKX=1",
1177+
"-DXNN_ENABLE_AVX512VNNI=1",
1178+
"-DXNN_ENABLE_AVX512VBMI=1",
1179+
"-DXNN_ENABLE_AVXVNNI=0",
1180+
"-DXNN_ENABLE_AVXVNNIINT8=0",
1181+
"-DXNN_ENABLE_AVX512FP16=0",
1182+
"-DXNN_ENABLE_AVX512VNNIGFNI=0",
1183+
"-DXNN_ENABLE_AVX512BF16=0",
1184+
"-DXNN_ENABLE_AVX256VNNIGFNI=0",
1185+
"-DXNN_ENABLE_AVX512AMX=0",
1186+
"-DXNN_ENABLE_AVX256SKX=0",
1187+
"-DXNN_ENABLE_AVX256VNNI=0",
11041188
],
11051189
visibility = ["PUBLIC"],
11061190
exported_deps = COMMON_XNNPACK_DEPS + [

extension/llm/runner/text_decoder_runner.cpp

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -52,22 +52,25 @@ ::executorch::runtime::Result<executorch::aten::Tensor> TextDecoderRunner::step(
5252
auto numel = sizes[0];
5353
std::vector<::executorch::aten::SizesType> sizes_vec = {numel};
5454

55-
// Assuming the last dimension is the one with the variable token length,
56-
// for example [1, S] or [1, 1, S]
57-
sizes_vec[sizes_vec.size() - 1] = numel;
5855
TensorPtr start_pos_tensor;
5956
if (numel > 1) {
60-
// Assuming model is exported with cache_positions, create a tensor with
61-
// the same size as cache_positions
57+
// If we are here, model is exported with cache_positions, create a tensor
58+
// with the same length as input_ids. Assuming the last dimension is the
59+
// one with the variable token length, for example [1, S] or [1, 1, S]
60+
sizes_vec[sizes_vec.size() - 1] = tokens->numel();
6261
start_pos_tensor = empty(sizes_vec, ::executorch::aten::ScalarType::Long);
6362
torch::executor::native::arange_out_impl(
64-
start_pos, start_pos + numel, 1.0, *start_pos_tensor);
63+
start_pos, start_pos + tokens->numel(), 1.0, *start_pos_tensor);
6564
} else {
6665
// Assuming model is exported with input_pos, create a tensor with size 1
6766
start_pos_tensor = from_blob(
6867
&start_pos, sizes_vec, ::executorch::aten::ScalarType::Long);
6968
}
70-
ET_LOG(Info, "Start pos tensor numel: %zu", start_pos_tensor->numel());
69+
ET_LOG(
70+
Info,
71+
"Start pos tensor numel: %zu, tokens numel: %zu",
72+
start_pos_tensor->numel(),
73+
tokens->numel());
7174
auto outputs_res = module_->forward({tokens, start_pos_tensor});
7275
ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error());
7376
ET_CHECK_MSG(

extension/llm/runner/text_prefiller.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ class ET_EXPERIMENTAL TextPrefiller {
2121
public:
2222
TextPrefiller(
2323
TextDecoderRunner* text_decoder_runner,
24-
bool use_kv_cache_,
24+
bool use_kv_cache,
2525
bool enable_parallel_prefill,
2626
int64_t max_seq_len = 128);
2727

0 commit comments

Comments (0)