
Commit b031073

Author: pytorchbot
Message: 2024-09-21 nightly release (0eee42a)
Parent: ecb1ef1

File tree: 14 files changed, +165 −118 lines

backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl
Lines changed: 12 additions & 11 deletions

```diff
@@ -10,19 +10,16 @@

 #define PRECISION ${PRECISION}

+#include "indexing_utils.h"
+
 layout(std430) buffer;

-${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
-${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}

-layout(set = 0, binding = 2) uniform PRECISION restrict CopyArgs {
-  ivec3 range;
-  int unused0;
-  ivec3 src_offset;
-  int unused1;
-  ivec3 dst_offset;
-  int unused2;
-};
+${layout_declare_ubo(B, "ivec3", "range", "ivec3", "src_offset", "ivec3", "dst_offset")}
+${layout_declare_ubo(B, "ivec4", "out_axis_map")}
+${layout_declare_ubo(B, "ivec4", "in_axis_map")}

 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

@@ -36,5 +33,9 @@ void main() {
     return;
   }

-  imageStore(t_out, out_pos, texelFetch(t_in, in_pos, 0));
+  write_texel_lpos(
+      t_out,
+      out_pos,
+      load_texel_lpos(t_in, in_pos, in_axis_map),
+      out_axis_map);
 }
```
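
The old shader fetched with `texelFetch` and stored with `imageStore` directly, which assumes logical tensor axes coincide with physical texture axes. The new `load_texel_lpos`/`write_texel_lpos` helpers route each access through an axis map instead. A rough C++ analogue of the permutation idea (a hypothetical helper for illustration only, not the actual GLSL in `indexing_utils.h`):

```cpp
#include <array>
#include <cstdint>

using ivec3 = std::array<int32_t, 3>;
using ivec4 = std::array<int32_t, 4>;

// Hypothetical sketch: axis_map[i] names the physical texture axis that
// logical axis i maps to, so a logical position is permuted into a
// physical position before the texel is fetched or stored. The fourth
// component of the ivec4 is unused in this sketch.
ivec3 to_physical(const ivec3& lpos, const ivec4& axis_map) {
  ivec3 pos{};
  for (int i = 0; i < 3; ++i) {
    pos[axis_map[i]] = lpos[i];
  }
  return pos;
}
```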

backends/vulkan/runtime/graph/ops/impl/Copy.cpp
Lines changed: 8 additions & 10 deletions

```diff
@@ -33,19 +33,13 @@ void add_copy_offset_node(
   add_dtype_suffix(kernel_name, *t_out);

   const struct Block final {
-    ivec3 range;
-    int32_t unused0;
-    ivec3 src_offset;
-    int32_t unused1;
-    ivec3 dst_offset;
-    int32_t unused2;
+    alignas(16) ivec3 range;
+    alignas(16) ivec3 src_offset;
+    alignas(16) ivec3 dst_offset;
   } offset_params{
       range,
-      0,
       src_offset,
-      0,
       dst_offset,
-      0,
   };

   auto shader = VK_KERNEL_FROM_STR(kernel_name);
@@ -61,7 +55,11 @@ void add_copy_offset_node(
           {in, vkapi::MemoryAccessType::READ},
       },
       // Parameter buffers
-      {graph.create_params_buffer(offset_params)},
+      {
+          graph.create_params_buffer(offset_params),
+          t_out->axis_map_ubo(),
+          t_in->axis_map_ubo(),
+      },
       // Specialization Constants
       {}));
 }
```
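
On the host side, the manual `unused0`/`unused1`/`unused2` padding fields can be dropped because `alignas(16)` reproduces the std140/std430 rule that an `(i)vec3` member starts on a 16-byte boundary. A minimal standalone sketch, using a stand-in `ivec3` (the real type comes from the Vulkan utils headers):

```cpp
#include <cstddef>
#include <cstdint>

struct ivec3 {
  int32_t x, y, z; // 12 bytes; stand-in for the real utils type
};

struct Block {
  alignas(16) ivec3 range;      // bytes  0..11, next member pushed to 16
  alignas(16) ivec3 src_offset; // bytes 16..27, next member pushed to 32
  alignas(16) ivec3 dst_offset; // bytes 32..43
};

// Same offsets the explicit int padding fields used to produce by hand.
static_assert(offsetof(Block, src_offset) == 16, "matches std430 layout");
static_assert(offsetof(Block, dst_offset) == 32, "matches std430 layout");
static_assert(sizeof(Block) == 48, "struct size rounds up to its alignment");
```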

docs/source/apple-runtime.md
Lines changed: 13 additions & 0 deletions

```diff
@@ -19,6 +19,19 @@ Link your binary with the ExecuTorch runtime and any backends or kernels used by

 ## Integration

+### Setup
+
+#### CMake
+
+Building the Xcode project requires CMake. Installing via homebrew does not
+typically work; instead, install the packaged application and commandline tools
+globally:
+
+1. Download the macOS `.dmg` installer from https://cmake.org/download
+2. Open the `.dmg`
+3. Drag the CMake app to the `/Applications` folder
+4. In a terminal, install the command line tools: `sudo /Applications/CMake.app/Contents/bin/cmake-gui --install`
+
 ### Swift Package Manager

 The prebuilt ExecuTorch runtime, backend, and kernels are available as a [Swift PM](https://www.swift.org/documentation/package-manager/) package.
```
examples/arm/executor_runner/arm_executor_runner.cpp
Lines changed: 35 additions & 28 deletions

```diff
@@ -45,10 +45,23 @@ char* model_pte = nullptr;
 #include "model_pte.h"
 #endif

-using namespace exec_aten;
-using namespace std;
-using torch::executor::Error;
-using torch::executor::Result;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::aten::TensorImpl;
+using executorch::extension::BufferCleanup;
+using executorch::extension::BufferDataLoader;
+using executorch::runtime::Error;
+using executorch::runtime::EValue;
+using executorch::runtime::HierarchicalAllocator;
+using executorch::runtime::MemoryAllocator;
+using executorch::runtime::MemoryManager;
+using executorch::runtime::Method;
+using executorch::runtime::MethodMeta;
+using executorch::runtime::Program;
+using executorch::runtime::Result;
+using executorch::runtime::Span;
+using executorch::runtime::Tag;
+using executorch::runtime::TensorInfo;

 #define METHOD_ALLOCATOR_POOL_SIZE (70 * 1024 * 1024)
 unsigned char __attribute__((
@@ -86,11 +99,10 @@ void et_pal_emit_log_message(
 }

 namespace {
-using namespace torch::executor;

-Result<util::BufferCleanup> prepare_input_tensors(
+Result<BufferCleanup> prepare_input_tensors(
     Method& method,
-    torch::executor::MemoryAllocator& allocator,
+    MemoryAllocator& allocator,
     std::vector<std::pair<char*, size_t>>& input_buffers) {
   MethodMeta method_meta = method.method_meta();
   size_t num_inputs = method_meta.num_inputs();
@@ -175,18 +187,18 @@ Result<util::BufferCleanup> prepare_input_tensors(
       ET_LOG(
           Error, "Failed to prepare input %zu: 0x%" PRIx32, i, (uint32_t)err);
       // The BufferCleanup will free the inputs when it goes out of scope.
-      util::BufferCleanup cleanup({inputs, num_allocated});
+      BufferCleanup cleanup({inputs, num_allocated});
       return err;
     }
   }
-  return util::BufferCleanup({inputs, num_allocated});
+  return BufferCleanup({inputs, num_allocated});
 }

 #ifdef SEMIHOSTING

 std::pair<char*, size_t> read_binary_file(
     const char* filename,
-    torch::executor::MemoryAllocator& allocator) {
+    MemoryAllocator& allocator) {
   FILE* fp = fopen(filename, "rb");
   if (!fp) {
     ET_LOG(
@@ -238,13 +250,13 @@ int main(int argc, const char* argv[]) {
   (void)argv;
 #endif

-  torch::executor::runtime_init();
+  executorch::runtime::runtime_init();
   std::vector<std::pair<char*, size_t>> input_buffers;
   size_t pte_size = sizeof(model_pte);

 #ifdef SEMIHOSTING
   const char* output_basename = nullptr;
-  torch::executor::MemoryAllocator input_allocator(
+  MemoryAllocator input_allocator(
       input_allocation_pool_size, input_allocation_pool);

   /* parse input parameters */
@@ -277,10 +289,9 @@ int main(int argc, const char* argv[]) {
   }
 #endif
   ET_LOG(Info, "Model in %p %c", model_pte, model_pte[0]);
-  auto loader = torch::executor::util::BufferDataLoader(model_pte, pte_size);
+  auto loader = BufferDataLoader(model_pte, pte_size);
   ET_LOG(Info, "Model PTE file loaded. Size: %lu bytes.", pte_size);
-  Result<torch::executor::Program> program =
-      torch::executor::Program::load(&loader);
+  Result<Program> program = Program::load(&loader);
   if (!program.ok()) {
     ET_LOG(
         Info,
@@ -299,8 +310,7 @@ int main(int argc, const char* argv[]) {
   }
   ET_LOG(Info, "Running method %s", method_name);

-  Result<torch::executor::MethodMeta> method_meta =
-      program->method_meta(method_name);
+  Result<MethodMeta> method_meta = program->method_meta(method_name);
   if (!method_meta.ok()) {
     ET_LOG(
         Info,
@@ -309,13 +319,11 @@ int main(int argc, const char* argv[]) {
         (unsigned int)method_meta.error());
   }

-  torch::executor::MemoryAllocator method_allocator{
-      torch::executor::MemoryAllocator(
-          METHOD_ALLOCATOR_POOL_SIZE, method_allocation_pool)};
+  MemoryAllocator method_allocator(
+      METHOD_ALLOCATOR_POOL_SIZE, method_allocation_pool);

   std::vector<uint8_t*> planned_buffers; // Owns the memory
-  std::vector<torch::executor::Span<uint8_t>>
-      planned_spans; // Passed to the allocator
+  std::vector<Span<uint8_t>> planned_spans; // Passed to the allocator
   size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers();

   for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
@@ -330,17 +338,16 @@ int main(int argc, const char* argv[]) {
     planned_spans.push_back({planned_buffers.back(), buffer_size});
   }

-  torch::executor::HierarchicalAllocator planned_memory(
+  HierarchicalAllocator planned_memory(
       {planned_spans.data(), planned_spans.size()});

-  torch::executor::MemoryAllocator temp_allocator(
+  MemoryAllocator temp_allocator(
       temp_allocation_pool_size, temp_allocation_pool);

-  torch::executor::MemoryManager memory_manager(
+  MemoryManager memory_manager(
       &method_allocator, &planned_memory, &temp_allocator);

-  Result<torch::executor::Method> method =
-      program->load_method(method_name, &memory_manager);
+  Result<Method> method = program->load_method(method_name, &memory_manager);
   if (!method.ok()) {
     ET_LOG(
         Info,
@@ -379,7 +386,7 @@ int main(int argc, const char* argv[]) {
     ET_LOG(Info, "Model executed successfully.");
   }

-  std::vector<torch::executor::EValue> outputs(method->outputs_size());
+  std::vector<EValue> outputs(method->outputs_size());
   ET_LOG(Info, "%zu outputs: ", outputs.size());
   status = method->get_outputs(outputs.data(), outputs.size());
   ET_CHECK(status == Error::Ok);
```
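
This migration is mostly mechanical: `torch::executor` and `torch::executor::util` give way to `executorch::runtime`, `executorch::aten`, and `executorch::extension`. A condensed sketch of the migrated load path follows; the header paths are assumptions inferred from the namespaces above, and the buffer arguments are placeholders:

```cpp
#include <cstddef>

#include <executorch/extension/data_loader/buffer_data_loader.h>
#include <executorch/runtime/executor/program.h>
#include <executorch/runtime/platform/runtime.h>

using executorch::extension::BufferDataLoader;
using executorch::runtime::Program;
using executorch::runtime::Result;

// Illustrative only: mirrors the runner's flow up to Program::load.
int load_program(const void* pte_data, size_t pte_size) {
  executorch::runtime::runtime_init(); // was torch::executor::runtime_init()
  BufferDataLoader loader(pte_data, pte_size);
  Result<Program> program = Program::load(&loader);
  if (!program.ok()) {
    return static_cast<int>(program.error());
  }
  // ...method_meta(), MemoryManager setup, and load_method() follow,
  // exactly as in the diff above.
  return 0;
}
```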

examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md
Lines changed: 2 additions & 2 deletions

````diff
@@ -112,7 +112,7 @@ Before continuing forward, make sure to modify the tokenizer, token embedding, a
 Prior to deploying the files on device, make sure to modify the tokenizer, token embedding, and model file names in examples/mediatek/executor_runner/run_llama3_sample.sh reflect what was generated during the Export Llama Model step.

 <p align="center">
-<img src="../screenshots/mtk_changes_to_shell_file.png" width=600>
+<img src="https://raw.githubusercontent.com/pytorch/executorch/refs/heads/main/docs/source/_static/img/mtk_changes_to_shell_file.png" width=600>
 </p>

 In addition, create a sample_prompt.txt file with a prompt. This will be deployed to the device in the next step.
@@ -150,7 +150,7 @@ adb shell
 ```

 <p align="center">
-<img src="../screenshots/mtk_output.png" width=800>
+<img src="https://raw.githubusercontent.com/pytorch/executorch/refs/heads/main/docs/source/_static/img/mtk_output.png" width=800>
 </p>

 ## Reporting Issues
````

examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md
Lines changed: 1 addition & 1 deletion

```diff
@@ -221,7 +221,7 @@ popd
 If the app successfully run on your device, you should see something like below:

 <p align="center">
-<img src="https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/docs/screenshots/opening_the_app_details.png" width=800>
+<img src="https://raw.githubusercontent.com/pytorch/executorch/refs/heads/main/docs/source/_static/img/opening_the_app_details.png" width=800>
 </p>

 ## Reporting Issues
```

examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md
Lines changed: 1 addition & 1 deletion

```diff
@@ -149,7 +149,7 @@ popd
 If the app successfully run on your device, you should see something like below:

 <p align="center">
-<img src="../screenshots/opening_the_app_details.png" width=800>
+<img src="https://raw.githubusercontent.com/pytorch/executorch/refs/heads/main/docs/source/_static/img/opening_the_app_details.png" width=800>
 </p>

 ## Reporting Issues
```

examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md
Lines changed: 3 additions & 3 deletions

```diff
@@ -95,19 +95,19 @@ Note: To access logs, link against the Debug build of the ExecuTorch runtime, i.
 For more details integrating and Running ExecuTorch on Apple Platforms, checkout this [link](https://pytorch.org/executorch/main/apple-runtime.html).

 <p align="center">
-<img src="../screenshots/ios_demo_app_swift_pm.png" alt="iOS LLaMA App Swift PM" width="600">
+<img src="https://raw.githubusercontent.com/pytorch/executorch/refs/heads/main/docs/source/_static/img/ios_demo_app_swift_pm.png" alt="iOS LLaMA App Swift PM" width="600">
 </p>

 Then select which ExecuTorch framework should link against which target.

 <p align="center">
-<img src="../screenshots/ios_demo_app_choosing_package.png" alt="iOS LLaMA App Choosing package" width="600">
+<img src="https://raw.githubusercontent.com/pytorch/executorch/refs/heads/main/docs/source/_static/img/ios_demo_app_choosing_package.png" alt="iOS LLaMA App Choosing package" width="600">
 </p>

 Click “Run” to build the app and run in on your iPhone. If the app successfully run on your device, you should see something like below:

 <p align="center">
-<img src="../screenshots/ios_demo_app_mps.jpg" alt="iOS LLaMA App mps" width="300">
+<img src="https://raw.githubusercontent.com/pytorch/executorch/refs/heads/main/docs/source/_static/img/ios_demo_app_mps.jpg" alt="iOS LLaMA App mps" width="300">
 </p>

 ## Reporting Issues
```

examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md
Lines changed: 4 additions & 4 deletions

```diff
@@ -95,25 +95,25 @@ Note: To access logs, link against the Debug build of the ExecuTorch runtime, i.
 For more details integrating and Running ExecuTorch on Apple Platforms, checkout this [link](https://pytorch.org/executorch/main/apple-runtime.html).

 <p align="center">
-<img src="../screenshots/ios_demo_app_swift_pm.png" alt="iOS LLaMA App Swift PM" width="600">
+<img src="https://raw.githubusercontent.com/pytorch/executorch/refs/heads/main/docs/source/_static/img/ios_demo_app_swift_pm.png" alt="iOS LLaMA App Swift PM" width="600">
 </p>

 Then select which ExecuTorch framework should link against which target.

 <p align="center">
-<img src="../screenshots/ios_demo_app_choosing_package.png" alt="iOS LLaMA App Choosing package" width="600">
+<img src="https://raw.githubusercontent.com/pytorch/executorch/refs/heads/main/docs/source/_static/img/ios_demo_app_choosing_package.png" alt="iOS LLaMA App Choosing package" width="600">
 </p>

 Click “Run” to build the app and run in on your iPhone. If the app successfully run on your device, you should see something like below:

 <p align="center">
-<img src="../screenshots/ios_demo_app.jpg" alt="iOS LLaMA App" width="300">
+<img src="https://raw.githubusercontent.com/pytorch/executorch/refs/heads/main/docs/source/_static/img/ios_demo_app.jpg" alt="iOS LLaMA App" width="300">
 </p>

 For Llava 1.5 models, you can select and image (via image/camera selector button) before typing prompt and send button.

 <p align="center">
-<img src="../screenshots/ios_demo_app_llava.jpg" alt="iOS LLaMA App" width="300">
+<img src="https://raw.githubusercontent.com/pytorch/executorch/refs/heads/main/docs/source/_static/img/ios_demo_app_llava.jpg" alt="iOS LLaMA App" width="300">
 </p>

 ## Reporting Issues
```

examples/models/phi-3-mini/main.cpp
Lines changed: 1 addition & 1 deletion

```diff
@@ -42,7 +42,7 @@ int main(int32_t argc, char** argv) {

   int32_t seq_len = FLAGS_seq_len;

-  ::torch::executor::Runner runner(model_path, tokenizer_path, temperature);
+  example::Runner runner(model_path, tokenizer_path, temperature);

   runner.generate(prompt, seq_len);
```
