Commit ecb1ef1

Author: pytorchbot
Commit message: 2024-09-20 nightly release (b5741a6)
Parent: 928bfa8

File tree: 103 files changed, +1152 / -936 lines changed

.ci/scripts/test_llama.sh

Lines changed: 4 additions & 2 deletions
@@ -11,7 +11,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
 
 MODEL_NAME=$1 # stories110M
 BUILD_TOOL=$2 # buck2 or cmake
-DTYPE=$3 # fp16 or fp32
+DTYPE=$3 # fp16, bf16, or fp32
 MODE=${4:-"xnnpack+custom"} # portable or xnnpack+custom or xnnpack+custom+qe
 UPLOAD_DIR=${5:-}
 if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
@@ -29,7 +29,7 @@ if [[ -z "${BUILD_TOOL:-}" ]]; then
 fi
 
 if [[ -z "${DTYPE:-}" ]]; then
-  echo "Missing dtype, choose fp16 or fp32, exiting..."
+  echo "Missing dtype, choose fp16, bf16, or fp32, exiting..."
   exit 1
 fi
 
@@ -174,6 +174,8 @@ fi
 EXPORTED_MODEL_NAME="llama2"
 if [[ "${DTYPE}" == "fp16" ]]; then
   EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}_h"
+elif [[ "${DTYPE}" == "bf16" ]]; then
+  EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}_bf"
 elif [[ "${DTYPE}" == "fp32" ]]; then
   :
 else

.github/workflows/pull.yml

Lines changed: 7 additions & 0 deletions
@@ -91,6 +91,13 @@ jobs:
         dtype: [fp32]
         build-tool: [buck2, cmake]
         mode: [portable, xnnpack+custom, xnnpack+custom+qe]
+        include:
+          - dtype: bf16
+            build-tool: cmake
+            mode: portable
+          - dtype: bf16
+            build-tool: buck2
+            mode: portable
       fail-fast: false
     with:
       runner: linux.2xlarge

.github/workflows/trunk.yml

Lines changed: 3 additions & 0 deletions
@@ -224,6 +224,9 @@ jobs:
       matrix:
         dtype: [fp32]
         mode: [portable, xnnpack+kv+custom, mps, coreml]
+        include:
+          - dtype: bf16
+            mode: portable
       fail-fast: false
     with:
       runner: macos-m1-stable
New file (path not shown in this view)

Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -euo pipefail
+
+unset CMAKE_PREFIX_PATH
+git submodule sync
+git submodule update --init
+./install_requirements.sh
+
+rm -rf cmake-out
+
+STEPWISE_BUILD=false
+
+if $STEPWISE_BUILD; then
+  echo "Building ExecuTorch"
+  cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
+      -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DEXECUTORCH_ENABLE_EVENT_TRACER=OFF \
+      -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
+      -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \
+      -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \
+      -DEXECUTORCH_BUILD_CPUINFO=OFF \
+      -DEXECUTORCH_ENABLE_LOGGING=ON \
+      -DEXECUTORCH_USE_DL=OFF \
+      -DEXECUTORCH_BUILD_CADENCE=OFF \
+      -DFLATC_EXECUTABLE="$(which flatc)" \
+      -Bcmake-out .
+
+  echo "Building any Cadence-specific binaries on top"
+  cmake -DBUCK2="$BUCK" \
+      -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \
+      -DCMAKE_INSTALL_PREFIX=cmake-out \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DEXECUTORCH_BUILD_HOST_TARGETS=ON \
+      -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \
+      -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \
+      -DEXECUTORCH_BUILD_CADENCE=ON \
+      -DFLATC_EXECUTABLE="$(which flatc)" \
+      -DEXECUTORCH_ENABLE_LOGGING=ON \
+      -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \
+      -DEXECUTORCH_USE_DL=OFF \
+      -DBUILD_EXECUTORCH_PORTABLE_OPS=ON \
+      -DEXECUTORCH_BUILD_KERNELS_CUSTOM=OFF \
+      -DPYTHON_EXECUTABLE=python3 \
+      -DEXECUTORCH_NNLIB_OPT=ON \
+      -DEXECUTORCH_BUILD_GFLAGS=ON \
+      -DHAVE_FNMATCH_H=OFF \
+      -Bcmake-out/backends/cadence \
+      backends/cadence
+  cmake --build cmake-out/backends/cadence -j16
+else
+  echo "Building Cadence toolchain with ExecuTorch packages"
+  cmake_prefix_path="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags"
+  cmake -DBUCK2="$BUCK" \
+      -DCMAKE_PREFIX_PATH="${cmake_prefix_path}" \
+      -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \
+      -DCMAKE_INSTALL_PREFIX=cmake-out \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DEXECUTORCH_BUILD_HOST_TARGETS=ON \
+      -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \
+      -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \
+      -DEXECUTORCH_BUILD_CADENCE=OFF \
+      -DFLATC_EXECUTABLE="$(which flatc)" \
+      -DEXECUTORCH_ENABLE_LOGGING=ON \
+      -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \
+      -DEXECUTORCH_USE_DL=OFF \
+      -DBUILD_EXECUTORCH_PORTABLE_OPS=ON \
+      -DEXECUTORCH_BUILD_KERNELS_CUSTOM=OFF \
+      -DPYTHON_EXECUTABLE=python3 \
+      -DEXECUTORCH_NNLIB_OPT=ON \
+      -DEXECUTORCH_BUILD_GFLAGS=ON \
+      -DHAVE_FNMATCH_H=OFF \
+      -DEXECUTORCH_ENABLE_EVENT_TRACER=OFF \
+      -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
+      -DEXECUTORCH_BUILD_CPUINFO=OFF \
+      -Bcmake-out
+  cmake --build cmake-out --target install --config Release -j16
+fi
+
+echo "Run simple model to verify cmake build"
+python3 -m examples.portable.scripts.export --model_name="add"
+xt-run --turbo cmake-out/executor_runner --model_path=add.pte

backends/vulkan/runtime/api/containers/Tensor.cpp

Lines changed: 43 additions & 66 deletions
@@ -13,33 +13,16 @@
 namespace vkcompute {
 namespace api {
 
-/*
- * Given the strides of a buffer-backed tensor, estimate the equivalent memory
- * layout enum value by identifying the fastest moving dimension.
- */
-utils::GPUMemoryLayout estimate_memory_layout(
-    const std::vector<int64_t>& dim_order) {
-  int64_t fastest_dim_whcn = dim_order.size() - 1 - dim_order.back();
-  if (fastest_dim_whcn >= 0 && fastest_dim_whcn < 3) {
-    return utils::GPUMemoryLayout(fastest_dim_whcn);
-  }
-
-  // TODO(ssjia) find a way to gracefully recover from this case by i.e. adding
-  // a UNKOWN GPUMemoryLayout. This is not high priority though because we don't
-  // expect this to ever come up in practice.
-  VK_THROW("No compatible GPUMemoryLayout value");
-}
-
 std::vector<int64_t> calculate_dim_order(
     const size_t ndim,
-    const utils::GPUMemoryLayout memory_layout) {
+    const int32_t packed_dim) {
   // Special case for zero dim tensors
   if (ndim == 0) {
    return {0};
   }
   std::vector<int64_t> dim_order(ndim);
-  int64_t last_dim =
-      ndim - utils::to_packed_dim_nchw_offset<int64_t>(memory_layout);
+  // Explicitly convert ndim to signed to prevent underflow
+  int64_t last_dim = int64_t(ndim) - 1 - packed_dim;
 
   int64_t cur_dim = 0;
   for (int d = 0; d < ndim; ++d) {
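The new packed_dim parameter is a WHCN index (0 = width, 1 = height, 2 = channels, matching the per-layout switch removed later in this file), and calculate_dim_order places the corresponding NCHW dimension last so it becomes the fastest-moving one. A minimal standalone sketch of that index arithmetic (illustrative only, not code from this commit):

    // Illustrative sketch: maps a WHCN packed-dim index to the NCHW dimension
    // that ends up innermost, mirroring last_dim = int64_t(ndim) - 1 - packed_dim.
    #include <cstdint>
    #include <cstdio>

    int64_t innermost_nchw_dim(size_t ndim, int32_t packed_dim) {
      // Convert ndim to signed first so the subtraction cannot underflow.
      return static_cast<int64_t>(ndim) - 1 - packed_dim;
    }

    int main() {
      // 4D NCHW tensor, width-packed (WHCN index 0): width is NCHW dim 3.
      std::printf("%lld\n", static_cast<long long>(innermost_nchw_dim(4, 0)));
      // Channels-packed (WHCN index 2): channels is NCHW dim 1.
      std::printf("%lld\n", static_cast<long long>(innermost_nchw_dim(4, 2)));
      return 0;
    }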
@@ -149,7 +132,7 @@ std::vector<int64_t> unsqueeze_strides(
 
 std::vector<int64_t> calculate_padded_sizes(
     const std::vector<int64_t>& sizes,
-    const utils::GPUMemoryLayout memory_layout) {
+    const int32_t packed_dim) {
   int64_t ndim = sizes.size();
   if (ndim == 0) {
     ndim = 1;
@@ -163,8 +146,7 @@ std::vector<int64_t> calculate_padded_sizes(
   }
 
   // Pad the packed dim to the next multiple of 4.
-  const int64_t dim_offset =
-      utils::to_packed_dim_nchw_offset<int64_t>(memory_layout);
+  const int64_t dim_offset = packed_dim + 1;
   const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes);
   padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up_4(padded_dim_size);
 
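Here dim_offset = packed_dim + 1 acts as a negative index counted from the innermost WHCN dimension, and that dimension's size is rounded up to the next multiple of 4. A small sketch of such a round-up, assuming utils::align_up_4 behaves like the usual bit-mask rounding (an assumption; the helper itself is not shown in this diff):

    // Sketch of rounding a size up to the next multiple of 4; assumed to match
    // what utils::align_up_4 does, but written independently here.
    #include <cassert>
    #include <cstdint>

    int64_t align_up_4(int64_t v) {
      return (v + 3) & ~int64_t(3);
    }

    int main() {
      assert(align_up_4(1) == 4);
      assert(align_up_4(5) == 8);
      assert(align_up_4(8) == 8);  // already-aligned sizes are unchanged
      return 0;
    }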

@@ -174,7 +156,7 @@
 utils::uvec3 calculate_image_extents(
     const std::vector<int64_t>& padded_sizes,
     const std::vector<int64_t>& axis_map,
-    const utils::GPUMemoryLayout memory_layout) {
+    const int32_t packed_dim) {
   VK_CHECK_COND(padded_sizes.size() == 4);
   VK_CHECK_COND(axis_map.size() == 4);
 
@@ -195,21 +177,8 @@ utils::uvec3 calculate_image_extents(
   // Multiply the extents of the batch axis by the batch size.
   extents[batch_axis] *= padded_sizes.at(0);
 
-  switch (memory_layout) {
-    case utils::kWidthPacked:
-      VK_CHECK_COND(extents[axis_map.at(0)] % 4 == 0);
-      extents[axis_map.at(0)] /= 4;
-      break;
-    case utils::kHeightPacked:
-      VK_CHECK_COND(extents[axis_map.at(1)] % 4 == 0);
-      extents[axis_map.at(1)] /= 4;
-      break;
-    case utils::kChannelsPacked:
-      VK_CHECK_COND(extents[axis_map.at(2)] % 4 == 0);
-      extents[axis_map.at(2)] /= 4;
-      break;
-  }
-
+  VK_CHECK_COND(extents[axis_map.at(packed_dim)] % 4 == 0);
+  extents[axis_map.at(packed_dim)] /= 4;
   return extents;
 }
 
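Because the packed dimension is now a plain index, the per-layout switch removed above collapses into a single lookup: the image extent along the packed axis is divided by 4, since four elements share one texel. A toy illustration, assuming an identity axis map purely for simplicity (not code from this commit):

    // Toy illustration: dividing the packed axis of a (W, H, C) texel grid by 4,
    // assuming an identity axis map so WHCN axis i maps to image axis i.
    #include <array>
    #include <cassert>
    #include <cstdint>

    int main() {
      std::array<uint32_t, 3> extents = {8, 3, 4};  // padded W, H, C
      const int32_t packed_dim = 0;                 // width-packed
      assert(extents[packed_dim] % 4 == 0);
      extents[packed_dim] /= 4;                     // four elements per texel
      assert((extents == std::array<uint32_t, 3>{2, 3, 4}));
      return 0;
    }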

@@ -285,15 +254,15 @@ vkapi::VulkanBuffer allocate_buffer(
 vTensorStorage::vTensorStorage(
     Context* const context,
     const utils::StorageType storage_type,
-    const utils::GPUMemoryLayout gpu_memory_layout,
     const std::vector<int64_t>& axis_map,
+    const int32_t packed_dim,
     const std::vector<int64_t>& padded_sizes,
     const vkapi::ScalarType dtype,
     const bool allocate_memory)
     : context_(context),
       storage_type_{storage_type},
       image_extents_(
-          calculate_image_extents(padded_sizes, axis_map, gpu_memory_layout)),
+          calculate_image_extents(padded_sizes, axis_map, packed_dim)),
       buffer_length_{utils::multiply_integers(padded_sizes)},
       buffer_offset_{0},
       image_(allocate_image(
@@ -408,14 +377,14 @@ vTensor::vTensor(
     const utils::GPUMemoryLayout memory_layout,
     const bool allocate_memory)
     : dtype_(dtype),
-      memory_layout_(memory_layout),
       // Calculate tensor metadata
       sizes_(sizes.begin(), sizes.end()),
-      dim_order_(calculate_dim_order(sizes_.size(), memory_layout_)),
+      packed_dim_(utils::to_packed_dim<int32_t>(memory_layout)),
+      dim_order_(calculate_dim_order(sizes_.size(), packed_dim_)),
       axis_map_(default_axis_map()),
       strides_(calculate_strides(sizes, dim_order_)),
       numel_(utils::multiply_integers(sizes_)),
-      padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)},
+      padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)},
       unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
       padded_numel_(utils::multiply_integers(padded_sizes_)),
       logical_limits_{{0, 0, 0}},
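The stored memory_layout_ member is gone; the constructor now derives packed_dim_ once from the requested layout. Judging from the removed code and the estimate_memory_layout() helper added further down, the correspondence is width-packed → 0, height-packed → 1, channels-packed → 2 in WHCN order. A compact sketch of that correspondence using a stand-in enum (the real utils::GPUMemoryLayout and utils::to_packed_dim live in the ExecuTorch Vulkan backend and are not reproduced here):

    // Stand-in enum to illustrate the layout <-> packed-dim correspondence;
    // not the actual utils::GPUMemoryLayout definition.
    #include <cstdint>

    enum class Layout : int32_t { kWidthPacked = 0, kHeightPacked = 1, kChannelsPacked = 2 };

    constexpr int32_t to_packed_dim(Layout layout) {
      return static_cast<int32_t>(layout);  // WHCN index of the packed axis
    }

    static_assert(to_packed_dim(Layout::kWidthPacked) == 0, "width packs WHCN dim 0");
    static_assert(to_packed_dim(Layout::kChannelsPacked) == 2, "channels pack WHCN dim 2");

    int main() { return 0; }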
@@ -429,8 +398,8 @@ vTensor::vTensor(
       storage_(
           context,
           storage_type,
-          memory_layout_,
           axis_map_,
+          packed_dim_,
           padded_sizes_,
           dtype_,
           allocate_memory) {
@@ -451,9 +420,9 @@ vTensor::vTensor(
 
 vTensor::vTensor(const vTensor& other)
     : dtype_(other.dtype_),
-      memory_layout_(other.memory_layout_),
       // Copy tensor size metadata
       sizes_(other.sizes_.begin(), other.sizes_.end()),
+      packed_dim_{other.packed_dim_},
       dim_order_(other.dim_order_.begin(), other.dim_order_.end()),
       axis_map_(other.axis_map_.begin(), other.axis_map_.end()),
       strides_(other.strides_.begin(), other.strides_.end()),
@@ -479,14 +448,14 @@ vTensor::vTensor(
     const std::vector<int64_t>& dim_order,
     const int64_t offset_numel)
     : dtype_(other.dtype_),
-      memory_layout_(estimate_memory_layout(dim_order)),
       // Copy tensor size metadata
      sizes_(sizes.begin(), sizes.end()),
+      packed_dim_(other.packed_dim_),
       dim_order_(dim_order.begin(), dim_order.end()),
       axis_map_(default_axis_map()),
       strides_(calculate_strides(sizes_, dim_order_)),
       numel_(utils::multiply_integers(sizes_)),
-      padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)},
+      padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)},
       unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
       padded_numel_(utils::multiply_integers(padded_sizes_)),
       logical_limits_(other.logical_limits_),
@@ -542,6 +511,19 @@ void vTensor::set_logical_limits(const utils::uvec3& image_extents) {
   logical_limits_.limits[2] = image_extents[axis_map_.at(2)];
 }
 
+utils::GPUMemoryLayout vTensor::estimate_memory_layout() const {
+  switch (packed_dim_) {
+    case WHCN::kWidthDim:
+      return utils::kWidthPacked;
+    case WHCN::kHeightDim:
+      return utils::kHeightPacked;
+    case WHCN::kChannelsDim:
+      return utils::kChannelsPacked;
+    default:
+      VK_THROW("Invalid packed dim");
+  }
+}
+
 const vkapi::BufferBindInfo vTensor::sizes_ubo() {
   if (!sizes_uniform_.buffer()) {
     sizes_uniform_ =
@@ -618,21 +600,16 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) {
 
 void vTensor::update_metadata() {
   strides_ = calculate_strides(sizes_, dim_order_);
-  // Only update the memory layout for buffer-backed tensors. Strides are
-  // meaningless for texture-backed tensors and do not impact the memory layout.
-  if (storage_type() == utils::kBuffer) {
-    memory_layout_ = estimate_memory_layout(dim_order_);
-  }
   numel_ = utils::multiply_integers(sizes_);
 
-  padded_sizes_ = calculate_padded_sizes(sizes_, memory_layout_);
+  padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_);
   unsqueezed_strides_ = unsqueeze_strides(strides_, numel_);
   padded_numel_ = utils::multiply_integers(padded_sizes_);
 
   // Calculate the image extents that would have been used to allocate a texture
   // withthe current sizes, and use that to set the logical limits.
   set_logical_limits(
-      calculate_image_extents(padded_sizes_, axis_map_, memory_layout_));
+      calculate_image_extents(padded_sizes_, axis_map_, packed_dim_));
 
   if (sizes_uniform_.buffer()) {
     sizes_uniform_.update(utils::make_whcn_ivec4(sizes_));
@@ -656,7 +633,7 @@ void vTensor::check_sizes(const std::vector<int64_t>& sizes) const {
   // For texture storage check that the current texture is large enough for
   // the new sizes of the tensor.
   utils::uvec3 virtual_extents =
-      calculate_image_extents(padded_sizes_, axis_map_, memory_layout_);
+      calculate_image_extents(padded_sizes_, axis_map_, packed_dim_);
 
   bool valid_resize = virtual_extents[0] <= storage_.image_extents_[0];
   valid_resize =
@@ -725,23 +702,23 @@ void transpose_dim_order_inplace(
 
 void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) {
   std::iter_swap(sizes_.begin() + dim0, sizes_.begin() + dim1);
+
+  const int dim0_whcn = sizes_.size() - 1 - dim0;
+  const int dim1_whcn = sizes_.size() - 1 - dim1;
+  if (packed_dim_ == dim0_whcn) {
+    packed_dim_ = dim1_whcn;
+  }
+  if (packed_dim_ == dim1_whcn) {
+    packed_dim_ = dim0_whcn;
+  }
+
   if (storage_type() == utils::kBuffer) {
     transpose_dim_order_inplace(dim_order_, dim0, dim1);
   } else {
-    const int dim0_whcn = sizes_.size() - 1 - dim0;
-    const int dim1_whcn = sizes_.size() - 1 - dim1;
     // Cannot transpose batch dimension for texture storage
     VK_CHECK_COND(dim0_whcn < 3 && dim1_whcn < 3);
-
     std::iter_swap(
         axis_map_.begin() + dim0_whcn, axis_map_.begin() + dim1_whcn);
-
-    if (packed_dim_whcn_idx() == dim0_whcn) {
-      memory_layout_ = utils::GPUMemoryLayout(dim1_whcn);
-    }
-    if (packed_dim_whcn_idx() == dim1_whcn) {
-      memory_layout_ = utils::GPUMemoryLayout(dim0_whcn);
-    }
   }
   update_metadata();
 }
