Commit 380e6a7

Update on "[ET-VK] Shortening code for slice op when packed dim is not the same as slice dim."

This diff shortens the Slice op's code for the case where the packed dimension is not the same as the slice dimension.

Differential Revision: [D70737264](https://our.internmc.facebook.com/intern/diff/D70737264/)

[ghstack-poisoned]
2 parents 8401f7a + b3181a4 commit 380e6a7
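
The Vulkan slice-op sources themselves are not among the diffs shown below (the visible files come from the rebase onto main), but as a rough, hypothetical sketch of the idea in the title, assuming the usual 4-elements-per-texel texture packing: when the slice dimension differs from the packed dimension, every output texel maps to exactly one input texel, so the shader only has to copy texels at shifted coordinates rather than unpack and repack elements.

import torch

# Rough illustration only -- not the actual ET-VK shader, and the 4-wide texel
# packing below is an assumption about the texture layout. Slicing along a dim
# other than the packed dim selects whole texels, so the op reduces to copying
# texels at shifted coordinates instead of unpacking and repacking elements.
H, W = 6, 8                              # W is the packed dim (multiple of 4)
t = torch.arange(H * W).reshape(H, W)
texels = t.reshape(H, W // 4, 4)         # each group of 4 is one "texel"
sliced = texels[2:5]                     # slice along dim 0, not the packed dim
assert torch.equal(sliced.reshape(3, W), t[2:5])   # texels survive intact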

45 files changed: +510 −520 lines

.ci/scripts/unittest-buck2.sh

Lines changed: 2 additions & 2 deletions

@@ -20,5 +20,5 @@ buck2 query "//backends/apple/... + //backends/example/... + \
 # TODO: expand the covered scope of Buck targets.
 # //runtime/kernel/... is failing because //third-party:torchgen_files's shell script can't find python on PATH.
 # //runtime/test/... requires Python torch, which we don't have in our OSS buck setup.
-buck2 build //runtime/backend/... //runtime/core/... //runtime/executor: //runtime/kernel/... //runtime/platform/...
-buck2 test //runtime/backend/... //runtime/core/... //runtime/executor: //runtime/kernel/... //runtime/platform/...
+buck2 test //kernels/portable/... //runtime/backend/... //runtime/core/... \
+  //runtime/executor: //runtime/kernel/... //runtime/platform/...

.lintrunner.toml

Lines changed: 2 additions & 0 deletions

@@ -218,6 +218,8 @@ exclude_patterns = [
     'examples/**',
     'extension/**',
     'kernels/optimized/**',
+    # Justified <functional> include.
+    'runtime/kernel/thread_parallel_interface.h',
     'scripts/**',
     'third-party/**',
     'util/**',

CMakeLists.txt

Lines changed: 0 additions & 1 deletion

@@ -751,7 +751,6 @@ if(EXECUTORCH_BUILD_PTHREADPOOL
    AND EXECUTORCH_BUILD_CPUINFO
 )
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/parallel)
 endif()

 if(EXECUTORCH_BUILD_PYBIND)

CODEOWNERS

Lines changed: 12 additions & 12 deletions

@@ -52,31 +52,31 @@
 /extension/export_util @kimishpatel
 /extension/flat_tensor @lucylq
 /extension/gguf_util @larryliu0820
-/extension/kernel_util @kimishpatel @manuelcandales
-/extension/llm @jackzhxng @iseeyuan @larryliu0820
-/extension/memory_allocator @JacobSzwejbka
+/extension/kernel_util @kimishpatel @manuelcandales @swolchok
+/extension/llm @jackzhxng @iseeyuan @larryliu0820 @swolchok
+/extension/memory_allocator @JacobSzwejbka @swolchok
 /extension/module @shoumikhin
-/extension/parallel @kimishpatel
+/extension/parallel @kimishpatel @swolchok
 /extension/pybindings @JacobSzwejbka @larryliu0820
-/extension/pytree @JacobSzwejbka
-# /extension/runner_util @dbort
+/extension/pytree @JacobSzwejbka @swolchok
+/extension/runner_util @swolchok
 /extension/tensor @shoumikhin
-# /extension/testing_util @dbort
-/extension/threadpool @kimishpatel
+/extension/testing_util @swolchok
+/extension/threadpool @kimishpatel @swolchok
 /extension/training @JacobSzwejbka

-/kernels @manuelcandales
+/kernels @manuelcandales @swolchok

 /profiler @tarun292 @Gasoonjia

-/runtime @JacobSzwejbka @lucylq
+/runtime @JacobSzwejbka @lucylq @swolchok
 /runtime/backend @cccclai

 /schema @JacobSzwejbka @lucylq

-/scripts @GregoryComer
+/scripts @GregoryComer @swolchok

-/shim @larryliu0820 @GregoryComer
+/shim @larryliu0820 @GregoryComer @swolchok

 /third-party @GregoryComer

Test.cmake

Lines changed: 0 additions & 1 deletion

@@ -13,7 +13,6 @@ if(BUILD_TESTING)
   add_subdirectory(extension/evalue_util/test)
   add_subdirectory(extension/kernel_util/test)
   add_subdirectory(extension/memory_allocator/test)
-  add_subdirectory(extension/parallel/test)
   add_subdirectory(extension/pytree/test)
   add_subdirectory(kernels/portable/cpu/util/test)
   add_subdirectory(kernels/prim_ops/test)

build/cmake_deps.toml

Lines changed: 4 additions & 18 deletions

@@ -88,7 +88,6 @@ excludes = [
 deps = [
   "executorch",
   "executorch_core",
-  "extension_parallel",
   "extension_threadpool",
   "portable_kernels",
 ]
@@ -131,7 +130,7 @@ excludes = [
 deps = [
   "executorch_core",
   "executorch",
-  "extension_parallel",
+  "extension_threadpool",
 ]

 [targets.optimized_native_cpu_ops]
@@ -146,7 +145,6 @@ excludes = [
 deps = [
   "executorch_core",
   "executorch",
-  "extension_parallel",
   "extension_threadpool",
   "portable_kernels",
 ]
@@ -227,19 +225,6 @@ deps = [
   "extension_runner_util",
 ]

-[targets.extension_parallel]
-buck_targets = [
-  "//extension/parallel:thread_parallel",
-]
-filters = [
-  ".cpp$",
-]
-deps = [
-  "executorch",
-  "executorch_core",
-  "extension_threadpool",
-]
-
 [targets.extension_tensor]
 buck_targets = [
   "//extension/tensor:tensor",
@@ -379,6 +364,7 @@ excludes = [
 deps = [
   "executorch",
   "executorch_core",
+  "extension_threadpool",
   "xnnpack_backend",
   "portable_kernels",
 ]
@@ -393,6 +379,7 @@ filters = [
 deps = [
   "executorch",
   "executorch_core",
+  "extension_threadpool",
 ]

 [targets.xnnpack_schema]
@@ -427,7 +414,6 @@ deps = [
   "executorch",
   "executorch_core",
   "optimized_kernels",
-  "extension_parallel",
   "extension_threadpool",
   "reduce_util",
   "xnnpack_backend",
@@ -465,7 +451,7 @@ deps = [
   "executorch_core",
   "extension_data_loader",
   "extension_module",
-  "extension_parallel",
+  "extension_threadpool",
   "portable_kernels",
   "quantized_kernels",
   "xnnpack_backend",

build/executorch-config.cmake

Lines changed: 1 addition & 7 deletions

@@ -75,7 +75,6 @@ set(lib_list
   custom_ops
   extension_module
   extension_module_static
-  extension_parallel
   extension_runner_util
   extension_tensor
   extension_threadpool
@@ -131,14 +130,9 @@ endforeach()

 # TODO: investigate use of install(EXPORT) to cleanly handle
 # target_compile_options/target_compile_definitions for everything.
-if(TARGET extension_parallel)
-  set_target_properties(
-    extension_parallel PROPERTIES INTERFACE_LINK_LIBRARIES extension_threadpool
-  )
-endif()
 if(TARGET cpublas)
   set_target_properties(
-    cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_parallel
+    cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_threadpool
   )
 endif()
 if(TARGET extension_threadpool)

examples/models/llama/runner/static_attention_io_manager.h

Lines changed: 16 additions & 2 deletions

@@ -38,6 +38,11 @@ class StaticKVCache {
     reset();
   }

+  StaticKVCache(const StaticKVCache& other) = delete;
+  StaticKVCache& operator=(const StaticKVCache& other) = delete;
+  StaticKVCache(StaticKVCache&& other) = delete;
+  StaticKVCache& operator=(StaticKVCache&& other) = delete;
+
   ~StaticKVCache() {
     allocator_.deallocate(data_, data_size_);
   }
@@ -200,6 +205,15 @@ class StaticAttentionMask {
     reset();
   }

+  StaticAttentionMask(const StaticAttentionMask& other) = delete;
+  StaticAttentionMask& operator=(const StaticAttentionMask& other) = delete;
+  StaticAttentionMask(StaticAttentionMask&& other) = delete;
+  StaticAttentionMask& operator=(StaticAttentionMask&& other) = delete;
+
+  ~StaticAttentionMask() {
+    allocator_.deallocate(data_, data_size_);
+  }
+
   /**
    * Reset the mask to the state where the cache contains no valid data.
    */
@@ -315,7 +329,7 @@ class StaticAttentionIOManager {
     input_pos_ += update_len;
     kCaches_.update(method, k_cache_output_indices, update_len);
     vCaches_.update(method, v_cache_output_indices, update_len);
-    for (auto it : attentionMasks_) {
+    for (auto& it : attentionMasks_) {
       it.second.updateCacheMask(update_len);
     }
   }
@@ -324,7 +338,7 @@ class StaticAttentionIOManager {
     input_pos_ = 0;
     kCaches_.reset();
     vCaches_.reset();
-    for (auto it : attentionMasks_) {
+    for (auto& it : attentionMasks_) {
       it.second.reset();
     }
   }

examples/models/llama/static_attention.py

Lines changed: 66 additions & 1 deletion

@@ -210,6 +210,7 @@ def __init__(self, config: ModelArgs, layer_id: int, rope: Rope):
         self.inv_scale = 1.0 / (float(self.head_dim) ** 0.5)
         self.attention_qkv_bias = config.attention_qkv_bias
         self.use_qk_norm = config.use_qk_norm
+        self.use_conv2d = False

         assert not self.use_qk_norm, "QK norm not supported in static attention yet"
         self.wqs = nn.ModuleList(
@@ -255,9 +256,25 @@ def forward(
         in_cache_state = kwargs.get("in_cache_state")
         out_cache_state = kwargs.get("out_cache_state")

+        bsz, seq_len, dim = x.shape
+        if self.use_conv2d:
+            x = x.reshape(bsz, seq_len, 1, dim).transpose(1, 3)
+
         new_qs = [self.wqs[i](x) for i in range(self.n_heads)]
         new_ks = [self.wks[i](x) for i in range(self.n_kv_heads)]
         new_vs = [self.wvs[i](x) for i in range(self.n_kv_heads)]
+
+        if self.use_conv2d:
+
+            def from_conv2ds(ts):
+                return [
+                    t.reshape(bsz, self.head_dim, seq_len).transpose(1, 2) for t in ts
+                ]
+
+            new_qs = from_conv2ds(new_qs)
+            new_ks = from_conv2ds(new_ks)
+            new_vs = from_conv2ds(new_vs)
+
         new_qs = [self.rope(q, freqs_cos, freqs_sin) for q in new_qs]
         new_ks = [self.rope(k, freqs_cos, freqs_sin) for k in new_ks]
         all_ks = []
@@ -282,7 +299,14 @@ def forward(
             heads.append(attn @ all_vs[kv_idx])

         y = torch.cat(heads, dim=-1)
-        y = self.wo(y)
+        if self.use_conv2d:
+            y = (
+                self.wo(y.reshape(bsz, seq_len, 1, -1).transpose(1, 3))
+                .transpose(1, 3)
+                .reshape(bsz, seq_len, -1)
+            )
+        else:
+            y = self.wo(y)
         return y, {"out_cache_state": out_cache_state}

     def load_weights_from_attention_mha(self, other: AttentionMHA):
@@ -300,3 +324,44 @@ def load_weights_from_attention_mha(self, other: AttentionMHA):
             )

         self.wo.weight.data.copy_(other.wo.weight)
+
+    def linear_to_conv2d(self):
+        def transfer_weight(linear, conv2d):
+            conv2d.weight.data.copy_(linear.weight[:, :, None, None])
+            return conv2d
+
+        self.wqs = nn.ModuleList(
+            [
+                transfer_weight(
+                    linear,
+                    nn.Conv2d(self.dim, self.head_dim, 1, bias=self.attention_qkv_bias),
+                )
+                for linear in self.wqs
+            ]
+        )
+        self.wks = nn.ModuleList(
+            [
+                transfer_weight(
+                    linear,
+                    nn.Conv2d(self.dim, self.head_dim, 1, bias=self.attention_qkv_bias),
+                )
+                for linear in self.wks
+            ]
+        )
+        self.wvs = nn.ModuleList(
+            [
+                transfer_weight(
+                    linear,
+                    nn.Conv2d(self.dim, self.head_dim, 1, bias=self.attention_qkv_bias),
+                )
+                for linear in self.wvs
+            ]
+        )
+        self.wo = transfer_weight(
+            self.wo,
+            nn.Conv2d(
+                self.n_heads * self.head_dim, self.dim, 1, bias=self.attention_qkv_bias
+            ),
+        )
+
+        self.use_conv2d = True
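
The linear_to_conv2d conversion above relies on a 1x1 nn.Conv2d being pointwise-equivalent to nn.Linear once the activations are laid out channels-first as (batch, channels, 1, seq_len). A minimal standalone check of that equivalence (the variable names here are illustrative, not part of the diff):

import torch
from torch import nn

# Sanity check: a 1x1 Conv2d with weights copied from a Linear layer produces
# the same result when the (batch, seq, dim) input is moved to channels-first
# (batch, dim, 1, seq) -- the same transposition used in forward() above.
dim, head_dim, bsz, seq_len = 64, 16, 2, 8
linear = nn.Linear(dim, head_dim, bias=False)
conv = nn.Conv2d(dim, head_dim, 1, bias=False)
conv.weight.data.copy_(linear.weight[:, :, None, None])

x = torch.rand(bsz, seq_len, dim)
y_linear = linear(x)
y_conv = (
    conv(x.reshape(bsz, seq_len, 1, dim).transpose(1, 3))
    .transpose(1, 3)
    .reshape(bsz, seq_len, -1)
)
assert torch.allclose(y_linear, y_conv, atol=1e-5)

test_without_cache in the next file exercises the same equivalence end to end via static_attn.linear_to_conv2d().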

examples/models/llama/tests/test_static_attention.py

Lines changed: 31 additions & 25 deletions

@@ -17,32 +17,38 @@ def setUp(self):
         torch.manual_seed(42)

     def test_without_cache(self):
-        config = ModelArgs(
-            dim=64,
-            n_heads=4,
-            n_kv_heads=2,
-            max_seq_len=8,
-        )
-        layer_id = 0
-        rope = Rope(config)
-        attn_mha = AttentionMHA(config, layer_id, rope).eval()
-        static_attn = StaticAttention(config, layer_id, rope).eval()
-        static_attn.load_weights_from_attention_mha(attn_mha)
+        def test(use_conv2d):
+            config = ModelArgs(
+                dim=64,
+                n_heads=4,
+                n_kv_heads=2,
+                max_seq_len=8,
+            )
+            layer_id = 0
+            rope = Rope(config)
+            attn_mha = AttentionMHA(config, layer_id, rope).eval()
+            static_attn = StaticAttention(config, layer_id, rope).eval()
+            static_attn.load_weights_from_attention_mha(attn_mha)
+            if use_conv2d:
+                static_attn.linear_to_conv2d()
+
+            x = torch.rand(1, config.max_seq_len, config.dim)
+            freqs_cos, freqs_sin = rope.get_freqs(None, config.max_seq_len)
+            expected, _ = attn_mha(x, freqs_cos, freqs_sin)
+            mask = torch.triu(
+                torch.full((1, config.max_seq_len, config.max_seq_len), float("-inf")),
+                diagonal=1,
+            )
+            y, _ = static_attn(
+                x,
+                freqs_cos,
+                freqs_sin,
+                mask=mask,
+            )
+            self.assertTrue(torch.isclose(y, expected, rtol=1e-3).all())

-        x = torch.rand(1, config.max_seq_len, config.dim)
-        freqs_cos, freqs_sin = rope.get_freqs(None, config.max_seq_len)
-        expected, _ = attn_mha(x, freqs_cos, freqs_sin)
-        mask = torch.triu(
-            torch.full((1, config.max_seq_len, config.max_seq_len), float("-inf")),
-            diagonal=1,
-        )
-        y, _ = static_attn(
-            x,
-            freqs_cos,
-            freqs_sin,
-            mask=mask,
-        )
-        self.assertTrue(torch.isclose(y, expected, rtol=1e-3).all())
+        test(True)
+        test(False)

     def test_hf_rope_without_cache(self):
         config = ModelArgs(
