Commit c4ced87

Update base for Update on "Export lora weights to sep file"
Differential Revision: [D83777195](https://our.internmc.facebook.com/intern/diff/D83777195/) [ghstack-poisoned]
2 parents 16126fc + 2eb8994 commit c4ced87

File tree

98 files changed: +3294 −591 lines
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-bd06b54e627fbfd354a2cffa4c80fb21883209a9
+44d8d54e38c0258357d4e92e1fefe21e845947a3

.github/workflows/cuda.yml

Lines changed: 84 additions & 0 deletions
@@ -86,3 +86,87 @@ jobs:
           PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
           export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
           PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda
+
+  test-voxtral-cuda-e2e:
+    name: test-voxtral-cuda-e2e
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    secrets: inherit
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Setup Huggingface"
+        pip install -U "huggingface_hub[cli]" accelerate
+        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        pip install mistral-common librosa
+        echo "::endgroup::"
+
+        echo "::group::Export Voxtral"
+        optimum-cli export executorch \
+          --model "mistralai/Voxtral-Mini-3B-2507" \
+          --task "multimodal-text-to-text" \
+          --recipe "cuda" \
+          --dtype bfloat16 \
+          --device cuda \
+          --max_seq_len 1024 \
+          --output_dir ./
+        echo "::endgroup::"
+
+        echo "::group::Build Voxtral Runner"
+        cmake -DCMAKE_BUILD_TYPE=Release \
+              -DEXECUTORCH_BUILD_CUDA=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+              -DEXECUTORCH_BUILD_TESTS=ON \
+              -Bcmake-out .
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
+        echo "::endgroup::"
+
+        echo "::group::Run Voxtral Runner"
+        # Capture output and allow exit code 139 if we have the expected printout
+        set +e
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        OUTPUT=$(cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd 2>&1)
+        EXIT_CODE=$?
+        set -e
+
+        echo "$OUTPUT"
+
+        # Check if the output contains "Run latency (ms):"
+        if echo "$OUTPUT" | grep -q "Run latency (ms):"; then
+          echo "Found expected output: 'Run latency (ms):'"
+          if [ $EXIT_CODE -eq 139 ]; then
+            echo "Exit code 139 (segfault) detected, but passing since we have the expected output"
+            exit 0
+          elif [ $EXIT_CODE -ne 0 ]; then
+            echo "Unexpected exit code: $EXIT_CODE"
+            exit $EXIT_CODE
+          else
+            echo "Command succeeded with exit code 0"
+            exit 0
+          fi
+        else
+          echo "Expected output 'Run latency (ms):' not found in output"
+          exit 1
+        fi
+        echo "::endgroup::"
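For context on the exit-code-139 check in the script above: POSIX shells report a child killed by signal N as exit status 128 + N, and SIGSEGV is signal 11 on Linux, so a runner that prints its latency line and then segfaults during teardown surfaces as 139. A minimal C++ sketch (illustrative, not part of this commit):

// Sketch: why a segfault shows up as shell exit status 139.
// POSIX shells encode "killed by signal N" as 128 + N; SIGSEGV is 11 on Linux.
#include <csignal>
#include <iostream>

int main() {
  std::cout << "segfault exit status: " << 128 + SIGSEGV << "\n"; // prints 139 on Linux
  return 0;
}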

CMakeLists.txt

Lines changed: 12 additions & 0 deletions
@@ -266,6 +266,18 @@ if(EXECUTORCH_BUILD_PTHREADPOOL)
   executorch_move_interface_include_directories_to_build_time_only(
     pthreadpool_interface
   )
+
+  if(APPLE)
+    # Use hidden visibility for pthreadpool on Apple platforms to avoid issues
+    # with pthreadpool symbols from libtorch_cpu taking precedence over the ones
+    # from the pthreadpool library statically linked in _portable_lib. The
+    # pthreadpool public APIs are marked as weak by default on some Apple
+    # platforms, so setting to hidden visibility works around this by not
+    # putting the symbol in the indirection table. See
+    # https://github.com/pytorch/executorch/issues/14321 for more details.
+    target_compile_options(pthreadpool PRIVATE -fvisibility=hidden)
+  endif()
+
   install(
     TARGETS pthreadpool pthreadpool_interface fxdiv
     EXPORT ExecuTorchTargets
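To illustrate the mechanism the flag above relies on (a minimal sketch, not part of this commit, with a hypothetical symbol name): compiling with -fvisibility=hidden, or marking a symbol hidden explicitly, keeps it out of the dynamic symbol table, so the loader cannot route calls to an identically named symbol provided by another library such as libtorch_cpu.

// Minimal visibility sketch (assumption: GCC/Clang). A hidden symbol stays
// local to its own image, so it cannot be interposed by another library's
// copy of the same function.
extern "C" __attribute__((visibility("hidden"))) int pthreadpool_like_api() {
  return 42; // hypothetical stand-in for a real pthreadpool entry point
}

int main() {
  return pthreadpool_like_api() == 42 ? 0 : 1;
}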

backends/aoti/CMakeLists.txt

Lines changed: 2 additions & 7 deletions
@@ -40,13 +40,8 @@ target_compile_options(aoti_common PUBLIC -fexceptions -frtti -fPIC)
 # Ensure symbols are exported properly
 target_link_options(aoti_common PUBLIC -Wl,--export-dynamic)

-# Link against PyTorch libraries and standard libraries
-target_link_libraries(
-  aoti_common
-  PUBLIC extension_tensor ${CMAKE_DL_LIBS}
-  # Link PyTorch libraries for AOTI functions
-  ${TORCH_LIBRARIES}
-)
+# Link against ExecuTorch libraries and standard libraries
+target_link_libraries(aoti_common PUBLIC extension_tensor ${CMAKE_DL_LIBS})
 executorch_target_link_options_shared_lib(aoti_common)

 install(

backends/aoti/aoti_model_container.h

Lines changed: 2 additions & 0 deletions
@@ -77,6 +77,8 @@ struct AOTIDelegateHandle {
   void* so_handle;
   std::string so_path;
   AOTInductorModelContainerHandle container_handle;
+  void* cuda_stream; // cudaStream_t stored as void* to avoid CUDA header
+                     // dependency
 };

 } // namespace aoti
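A small sketch of the pattern behind the new cuda_stream field (illustrative; all names here are hypothetical): the CUDA-free header stores the stream type-erased as void*, and only translation units that actually include the CUDA headers cast it back to cudaStream_t.

// Sketch of the void*-for-cudaStream_t pattern (assumption: simplified, with
// a fake stream type so it compiles without CUDA headers).

// CUDA-free header side: the handle knows nothing about CUDA types.
struct HandleSketch {
  void* cuda_stream = nullptr; // actually a cudaStream_t
};

// CUDA-aware source side: the real code would #include <cuda_runtime.h> and
// use cudaStream_t; FakeStream_t stands in here.
struct FakeStream;
using FakeStream_t = FakeStream*;

void sync_stream(HandleSketch& h) {
  FakeStream_t stream = static_cast<FakeStream_t>(h.cuda_stream);
  (void)stream; // would be passed to cudaStreamSynchronize(stream), etc.
}

int main() {
  HandleSketch h;
  sync_stream(h);
  return 0;
}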

backends/aoti/common_shims.cpp

Lines changed: 8 additions & 1 deletion
@@ -127,11 +127,18 @@ int32_t aoti_torch_layout_strided() {
 }

 // Dtype constants - these return the PyTorch dtype codes
-// Currently only float32 is supported, but using robust enum-based approach
 int32_t aoti_torch_dtype_float32() {
   return 6; // PyTorch's float32 dtype code
 }

+int32_t aoti_torch_dtype_bfloat16() {
+  return 15; // PyTorch's bfloat16 dtype code
+}
+
+int32_t aoti_torch_dtype_int64() {
+  return 4; // PyTorch's int64 dtype code
+}
+
 // Cleanup functions
 void cleanup_tensor_metadata() {
   internal::tensor_to_sizes.clear();

backends/aoti/common_shims.h

Lines changed: 2 additions & 0 deletions
@@ -58,6 +58,8 @@ AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim);
 int32_t aoti_torch_device_type_cpu();
 int32_t aoti_torch_layout_strided();
 int32_t aoti_torch_dtype_float32();
+int32_t aoti_torch_dtype_bfloat16();
+int32_t aoti_torch_dtype_int64();

 // Autograd mode functions
 int32_t aoti_torch_grad_mode_is_enabled();

backends/aoti/targets.bzl

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ def define_common_targets():
         link_whole = True,
         supports_python_dlopen = True,
         visibility = ["@EXECUTORCH_CLIENTS"],
-        deps = [
+        exported_deps = [
            ":common_shims",
            ":model_container",
        ],

backends/aoti/utils.h

Lines changed: 2 additions & 0 deletions
@@ -34,6 +34,8 @@ inline executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) {
   // Convert based on known PyTorch dtype codes (without CUDA-specific
   // dependency)
   switch (dtype) {
+    case 4: // PyTorch's int64 dtype code
+      return executorch::aten::ScalarType::Long;
     case 6: // PyTorch's float32 dtype code
       return executorch::aten::ScalarType::Float;
     case 15: // PyTorch's bfloat16 dtype code
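To see the dtype plumbing end to end (a standalone sketch, with a stand-in enum replacing executorch::aten::ScalarType): the shim constants in common_shims.cpp return PyTorch's integer dtype codes, and dtype_to_scalar_type maps those same codes back to ExecuTorch scalar types.

// Standalone sketch of the code -> scalar-type mapping (assumption:
// ScalarTypeSketch stands in for executorch::aten::ScalarType).
#include <cstdint>
#include <iostream>
#include <stdexcept>

enum class ScalarTypeSketch { Long, Float, BFloat16 };

ScalarTypeSketch dtype_to_scalar_type_sketch(int32_t dtype) {
  switch (dtype) {
    case 4:  return ScalarTypeSketch::Long;     // PyTorch's int64 code
    case 6:  return ScalarTypeSketch::Float;    // PyTorch's float32 code
    case 15: return ScalarTypeSketch::BFloat16; // PyTorch's bfloat16 code
    default: throw std::runtime_error("unsupported dtype code");
  }
}

int main() {
  // Exercise the int64 case added by this commit:
  std::cout << (dtype_to_scalar_type_sketch(4) == ScalarTypeSketch::Long) << "\n"; // prints 1
  return 0;
}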

backends/arm/_passes/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -91,6 +91,7 @@
     ReplaceScalarWithTensorArgPassTOSABI,
     ReplaceScalarWithTensorArgPassTOSAMI,
 )
+from .rewrite_matmul import RewriteMatmulPass # noqa
 from .rewrite_upsample import RewriteUpsamplePass # noqa
 from .scalars_to_attribute_pass import ScalarsToAttributePass # noqa
 from .size_adjust_input_pass import SizeAdjustInputPass # noqa
