Skip to content

Commit 2e26e77

Browse files
authored
Merge branch 'main' into Arm-backend-ArmTester-support-testing-with-portable-ops
2 parents 2f528f5 + 4675292 commit 2e26e77

File tree

240 files changed

+2815
-2082
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

240 files changed

+2815
-2082
lines changed

.ci/scripts/setup-windows-msvc.ps1

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
conda create --yes --quiet -n et python=3.12
2+
conda activate et
3+
4+
# Install cmake
5+
conda install -y cmake
6+
7+
# Activate the VS environment - this is required for MSVC to work
8+
# There are a bunch of environment variables that it requires.
9+
# See https://learn.microsoft.com/en-us/cpp/build/building-on-the-command-line.
10+
& "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\Launch-VsDevShell.ps1" -Arch amd64
11+
12+
# Install CI requirements
13+
pip install -r .ci/docker/requirements-ci.txt
14+
15+
# Create build directory
16+
$buildDir = "cmake-out-msvc"
17+
if (Test-Path -Path $buildDir) {
18+
Remove-Item -Path $buildDir -Recurse -Force
19+
}
20+
New-Item -Path $buildDir -ItemType Directory
21+
22+
# Configure CMake with MSVC (not ClangCL) and disable custom/quantized ops
23+
cmake -S . -B $buildDir `
24+
-DCMAKE_BUILD_TYPE=Release `
25+
-DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON `
26+
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON `
27+
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON `
28+
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON `
29+
-DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON `
30+
-DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON `
31+
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON `
32+
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=OFF `
33+
-DEXECUTORCH_BUILD_KERNELS_CUSTOM_AOT=OFF `
34+
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=OFF `
35+
-DEXECUTORCH_BUILD_XNNPACK=ON `
36+
-DEXECUTORCH_BUILD_EXTENSION_LLM=ON `
37+
-DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON
38+
39+
if ($LASTEXITCODE -ne 0) {
40+
Write-Host "CMake configuration failed. Exit code: $LASTEXITCODE."
41+
exit $LASTEXITCODE
42+
}
43+
44+
# Build with MSVC
45+
cmake --build $buildDir --config Release -j16
46+
47+
if ($LASTEXITCODE -ne 0) {
48+
Write-Host "Build failed. Exit code: $LASTEXITCODE."
49+
exit $LASTEXITCODE
50+
}
51+
52+
Write-Host "MSVC build completed successfully!"

.github/workflows/cuda.yml

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@ jobs:
8989
9090
export-voxtral-cuda-artifact:
9191
name: export-voxtral-cuda-${{ matrix.quant.name }}
92+
# Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
93+
if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
9294
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
9395
permissions:
9496
id-token: write
@@ -166,6 +168,8 @@ jobs:
166168
167169
export-gemma3-cuda-artifact:
168170
name: export-gemma3-cuda-${{ matrix.quant.name }}
171+
# Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
172+
if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
169173
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
170174
permissions:
171175
id-token: write
@@ -176,12 +180,12 @@ jobs:
176180
matrix:
177181
quant:
178182
- name: "non-quantized"
179-
artifact: "voxtral-cuda-export"
183+
artifact: "gemma3-cuda-export"
180184
extra_args: ""
181-
# TODO: enable gemma3 quantization
182-
# - name: "quantized-int4-tile-packed"
183-
# artifact: "voxtral-cuda-quantized-int4-tile-packed"
184-
# extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
185+
- name: "quantized-int4-tile-packed"
186+
artifact: "gemma3-cuda-quantized-int4-tile-packed"
187+
extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
188+
# TODO: enable int4-weight-only on gemma3.
185189
# - name: "quantized-int4-weight-only"
186190
# artifact: "voxtral-cuda-quantized-int4-weight-only"
187191
# # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
@@ -194,7 +198,7 @@ jobs:
194198
gpu-arch-version: 12.6
195199
use-custom-docker-registry: false
196200
submodules: recursive
197-
upload-artifact: gemma3-cuda-export
201+
upload-artifact: ${{ matrix.quant.artifact }}
198202
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
199203
script: |
200204
set -eux
@@ -255,7 +259,7 @@ jobs:
255259
set -eux
256260
257261
echo "::group::Setup ExecuTorch Requirements"
258-
CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
262+
./install_requirements.sh
259263
pip list
260264
echo "::endgroup::"
261265
@@ -305,7 +309,7 @@ jobs:
305309
set -eux
306310
307311
echo "::group::Setup ExecuTorch Requirements"
308-
CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
312+
./install_requirements.sh
309313
pip list
310314
echo "::endgroup::"
311315
@@ -363,7 +367,7 @@ jobs:
363367
set -eux
364368
365369
echo "::group::Setup ExecuTorch Requirements"
366-
CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
370+
./install_requirements.sh
367371
pip list
368372
echo "::endgroup::"
369373
@@ -435,9 +439,9 @@ jobs:
435439
format:
436440
- name: "non-quantized"
437441
artifact: "gemma3-cuda-export"
438-
# TODO: enable quantized gemma3.
439-
# - name: "quantized-int4-tile-packed"
440-
# artifact: "gemma3-cuda-quantized-int4-tile-packed"
442+
- name: "quantized-int4-tile-packed"
443+
artifact: "gemma3-cuda-quantized-int4-tile-packed"
444+
# TODO: enable int4-weight-only on gemma3.
441445
# - name: "quantized-int4-weight-only"
442446
# artifact: "gemma3-cuda-quantized-int4-weight-only"
443447
with:

.github/workflows/windows-msvc.yml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
name: Windows MSVC Build
2+
3+
on:
4+
push:
5+
branches:
6+
- main
7+
- release/*
8+
tags:
9+
- ciflow/trunk/*
10+
pull_request:
11+
paths:
12+
- .ci/docker/ci_commit_pins/pytorch.txt
13+
- .ci/scripts/**
14+
workflow_dispatch:
15+
16+
concurrency:
17+
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
18+
cancel-in-progress: true
19+
20+
jobs:
21+
build-windows-msvc:
22+
name: build-windows-msvc
23+
uses: pytorch/test-infra/.github/workflows/windows_job.yml@main
24+
with:
25+
submodules: 'recursive'
26+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
27+
timeout: 60
28+
script: |
29+
conda init powershell
30+
powershell -Command "& {
31+
Set-PSDebug -Trace 1
32+
\$ErrorActionPreference = 'Stop'
33+
\$PSNativeCommandUseErrorActionPreference = \$true
34+
.ci/scripts/setup-windows-msvc.ps1
35+
}"

.mypy.ini

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,12 @@ ignore_missing_imports = True
8383
[mypy-tosa_tools.*]
8484
ignore_missing_imports = True
8585

86+
[mypy-tosa_serializer]
87+
ignore_missing_imports = True
88+
89+
[mypy-tosa_serializer.*]
90+
ignore_missing_imports = True
91+
8692
[mypy-setuptools.*]
8793
ignore_missing_imports = True
8894

backends/apple/metal/metal_backend.py

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929

3030
# exist fallback operators in et namespace;
3131
supported_fallback_kernels: Dict[str, Any] = {
32-
"aoti_torch_mps_addmm_out": None,
3332
"aoti_torch_mps_convolution": None,
3433
"aoti_torch_mps_mm_out": None,
3534
"at::_ops::_scaled_dot_product_attention_math_for_mps::call": None,
@@ -108,34 +107,62 @@ def preprocess(
108107
options: dict[str, typing.Any] = {
109108
# Do not link against the full PyTorch/libtorch library
110109
"aot_inductor.link_libtorch": False,
111-
# Package model constants and other generated files directly in the shared object (.so) file
112-
"aot_inductor.package_constants_in_so": True,
110+
# Separate weight constants from the .so file
111+
"aot_inductor.package": True,
112+
"aot_inductor.package_constants_in_so": False,
113+
# Store weight constants on disk in a binary blob
114+
"aot_inductor.package_constants_on_disk_format": "binary_blob",
113115
# Enable maximum automatic tuning for optimal performance
114116
"max_autotune": True,
115117
# "aot_inductor.debug_compile": True,
116118
# "aot_inductor.force_mmap_weights": False,
117119
}
118120

119121
with collect_unsupported_fallback_kernels():
120-
so_path = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options) # type: ignore[arg-type]
122+
paths = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options) # type: ignore[arg-type]
121123
if len(missing_fallback_kernels) > 0:
122124
formatted_kernels = "\n - ".join(sorted(missing_fallback_kernels))
123125
raise RuntimeError(
124126
f"Missing fallback kernels ({len(missing_fallback_kernels)} total):\n - {formatted_kernels}\n"
125127
"Please add them to the AOTI backend."
126128
)
127129

130+
# Extract the .so and .blob paths from the returned list
131+
so_path = None
132+
blob_path = None
133+
for path in paths:
134+
if path.endswith(".wrapper.so"):
135+
so_path = path
136+
elif path.endswith(".wrapper_weights.blob"):
137+
blob_path = path
138+
139+
if so_path is None or blob_path is None:
140+
raise RuntimeError(
141+
f"Could not find required files in compiled paths, got {paths}"
142+
)
143+
128144
# pyre-ignorep[6]: Incompatible parameter type
129145
with open(so_path, "rb") as f:
130146
so_data = f.read()
131147

132148
named_data_store = NamedDataStore()
133149
method_name = MetalBackend.method_name_from_compile_specs(compile_specs)
150+
151+
# Keep the so file in the NamedDataStore, so that it can be packaged into the .pte file.
152+
named_data_store.add_named_data(method_name + "_so_blob", so_data, 1, None)
153+
154+
# Add weights blob to named data store
155+
with open(blob_path, "rb") as f:
156+
blob_data = f.read()
157+
134158
named_data_store.add_named_data(
135-
method_name + "_so_blob", so_data, 1, "aoti_metal_blob"
159+
method_name + "_weights_blob", blob_data, 1, "aoti_metal_blob"
136160
)
137161

138-
# Clean up the generated so file; it has been packaged into the NamdeDataStore
162+
# Clean up the weights blob file
163+
os.remove(blob_path)
164+
165+
# Clean up the generated so file; it has been packaged into the NamedDataStore
139166
# pyre-ignorep[6]: Incompatible parameter type
140167
os.remove(so_path)
141168

backends/apple/metal/runtime/metal_backend.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,15 @@ class ET_EXPERIMENTAL MetalBackend final
106106
Debug,
107107
"MetalBackend::load_function_pointers_into_handle - Loaded AOTInductorModelContainerRun");
108108

109+
LOAD_SYMBOL(
110+
handle,
111+
update_constants_from_blob,
112+
AOTInductorModelUpdateConstantsFromBlob,
113+
so_handle);
114+
ET_LOG(
115+
Debug,
116+
"MetalBackend::load_function_pointers_into_handle - Loaded AOTInductorModelUpdateConstantsFromBlob");
117+
109118
ET_LOG(
110119
Debug,
111120
"MetalBackend::load_function_pointers_into_handle - All symbols loaded successfully");
@@ -203,6 +212,9 @@ class ET_EXPERIMENTAL MetalBackend final
203212
outfile.close();
204213
ET_LOG(Info, "MetalBackend::init - File closed successfully");
205214

215+
// Free the buffer immediately after writing to disk
216+
aoti_metal_buffer->Free();
217+
206218
// Load the ELF using dlopen
207219
void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
208220
ET_CHECK_OR_RETURN_ERROR(
@@ -234,6 +246,20 @@ class ET_EXPERIMENTAL MetalBackend final
234246

235247
handle->container_handle = container_handle;
236248

249+
// Look into named data map for constant data
250+
std::string weights_blob_key =
251+
method_name.empty() ? "weights_blob" : method_name + "_weights_blob";
252+
auto buffer_res = named_data_map->get_data(weights_blob_key.c_str());
253+
if (buffer_res.ok() && handle->update_constants_from_blob != nullptr) {
254+
ET_LOG(Info, "Found %s in named data map", weights_blob_key.c_str());
255+
const void* weights_blob = buffer_res->data();
256+
// Feed the weights blob into the container. Under the hood it's copying
257+
// weights, so we should free the buffer immediately.
258+
ET_CHECK_OK_OR_RETURN_ERROR(handle->update_constants_from_blob(
259+
handle->container_handle, static_cast<const uint8_t*>(weights_blob)));
260+
buffer_res->Free();
261+
}
262+
237263
ET_LOG(Info, "MetalBackend::init - Initialization completed successfully");
238264
return (DelegateHandle*)handle; // Return the handle post-processing
239265
}

backends/apple/metal/runtime/shims/et_metal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,7 @@ extern "C" {
354354

355355
// Memory management functions for Metal
356356
void* metal_allocate_buffer(long bytes);
357+
void metal_deallocate_buffer(void* ptr);
357358
bool metal_is_device_pointer(void* ptr);
358359
int metal_copy_memory(
359360
void* dst,

backends/apple/metal/runtime/shims/et_metal.mm

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,21 @@ void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()) {
8686
}
8787
}
8888

89+
void metal_deallocate_buffer(void* ptr) {
90+
@autoreleasepool {
91+
auto it = ptr_to_mtl_buffer.find(ptr);
92+
if (it != ptr_to_mtl_buffer.end()) {
93+
id<MTLBuffer> buffer = it->second;
94+
[buffer release];
95+
ptr_to_mtl_buffer.erase(it);
96+
ET_LOG(Debug, "Deallocated Metal buffer for pointer %p", ptr);
97+
ptr = nullptr;
98+
} else {
99+
ET_LOG(Error, "Failed to find Metal buffer for pointer %p", ptr);
100+
}
101+
}
102+
}
103+
89104
void metal_cleanup_resources() {
90105
if (!ptr_to_mtl_buffer.empty()) {
91106
@autoreleasepool {
@@ -665,12 +680,16 @@ int metal_copy_memory(void* dst, const void* src, size_t nbytes, bool src_is_dev
665680

666681
// Commit methods
667682
void ETMetalStream::commit() {
668-
if (enableCommitAndContinue_ && commandBuffer_) {
669-
// Use commit-and-continue for better performance
670-
commitAndContinue();
671-
} else {
672-
flush();
683+
if (!commandBuffer_) {
684+
ET_LOG(Error, "ETMetalStream::commit: No command buffer to commit");
685+
return;
673686
}
687+
688+
[commandBuffer_ commit];
689+
ET_LOG(Debug, "ETMetalStream::commit: Committed buffer %p", commandBuffer_);
690+
691+
[commandBuffer_ release];
692+
commandBuffer_ = nil;
674693
}
675694

676695
void ETMetalStream::commitAndWait() {

0 commit comments

Comments
 (0)