Skip to content

Commit 51ae252

Browse files
committed
Update base for Update on "[Executorch][llm] Add ring buffer based kv cache and mask calculation to MHA"
Leveraging previous work now we allow MHA to have ring buffer cache. If ring buffer cache is used then we query the mask from kv cache and use that for sdpa instead of using precalculated mask. In this process we had to adjust ring buffer implementation to allow keeping the context of full sliding window. See code for comment. Differential Revision: [D73891425](https://our.internmc.facebook.com/intern/diff/D73891425/) [ghstack-poisoned]
2 parents dcf7c5a + cd3b53d commit 51ae252

File tree

186 files changed

+6628
-1314
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

186 files changed

+6628
-1314
lines changed

.ci/scripts/build-qnn-sdk.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ set_up_aot() {
3333
cmake .. \
3434
-DCMAKE_INSTALL_PREFIX=$PWD \
3535
-DEXECUTORCH_BUILD_QNN=ON \
36+
-DANDROID_NATIVE_API_LEVEL=30 \
3637
-DQNN_SDK_ROOT=${QNN_SDK_ROOT} \
3738
-DEXECUTORCH_BUILD_DEVTOOLS=ON \
3839
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \

.ci/scripts/test_model.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ test_model_with_coreml() {
222222

223223
DTYPE=float16
224224

225-
"${PYTHON_EXECUTABLE}" -m examples.apple.coreml.scripts.export --model_name="${MODEL_NAME}" --compute_precision "${DTYPE}"
225+
"${PYTHON_EXECUTABLE}" -m examples.apple.coreml.scripts.export --model_name="${MODEL_NAME}" --compute_precision "${DTYPE}" --use_partitioner
226226
EXPORTED_MODEL=$(find "." -type f -name "${MODEL_NAME}*.pte" -print -quit)
227227

228228
if [ -n "$EXPORTED_MODEL" ]; then

.github/workflows/_link_check.yml

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
on:
2+
workflow_call:
3+
inputs:
4+
ref:
5+
type: string
6+
required: true
7+
8+
jobs:
9+
lint-urls:
10+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
11+
with:
12+
runner: linux.2xlarge
13+
docker-image: executorch-ubuntu-22.04-linter
14+
submodules: 'none'
15+
fetch-depth: 0
16+
ref: ${{ inputs.ref }}
17+
timeout: 90
18+
script: |
19+
./scripts/lint_urls.sh $(
20+
[ "${{ github.event_name }}" = "pull_request" ] \
21+
&& git diff --name-only ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }} \
22+
|| [ "${{ github.event_name }}" = "push" ] \
23+
&& git diff --name-only ${{ github.event.before }} ${{ github.sha }}
24+
)
25+
26+
lint-xrefs:
27+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
28+
with:
29+
runner: linux.2xlarge
30+
docker-image: executorch-ubuntu-22.04-linter
31+
submodules: 'none'
32+
fetch-depth: 0
33+
ref: ${{ inputs.ref }}
34+
timeout: 90
35+
script: |
36+
./scripts/lint_xrefs.sh $(
37+
[ "${{ github.event_name }}" = "pull_request" ] \
38+
&& git diff --name-only ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }} \
39+
|| [ "${{ github.event_name }}" = "push" ] \
40+
&& git diff --name-only ${{ github.event.before }} ${{ github.sha }}
41+
)

.github/workflows/lint.yml

Lines changed: 2 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -64,29 +64,10 @@ jobs:
6464
6565
exit $RC
6666
67-
lint-urls:
68-
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
67+
link-check:
68+
uses: ./.github/workflows/_link_check.yml
6969
with:
70-
runner: linux.2xlarge
71-
docker-image: executorch-ubuntu-22.04-linter
72-
submodules: 'none'
73-
fetch-depth: 0
7470
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
75-
timeout: 90
76-
script: |
77-
./scripts/lint_urls.sh
78-
79-
lint-xrefs:
80-
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
81-
with:
82-
runner: linux.2xlarge
83-
docker-image: executorch-ubuntu-22.04-linter
84-
submodules: 'none'
85-
fetch-depth: 0
86-
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
87-
timeout: 90
88-
script: |
89-
./scripts/lint_xrefs.sh
9071

9172
android-java-format:
9273
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main

.github/workflows/nightly.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,9 @@ jobs:
3030
test-infra-ref: main
3131
updatebot-token: ${{ secrets.UPDATEBOT_TOKEN }}
3232
pytorchbot-token: ${{ secrets.GH_PYTORCHBOT_TOKEN }}
33+
34+
link-check:
35+
needs: update-pytorch-commit-hash
36+
uses: ./.github/workflows/_link_check.yml
37+
with:
38+
ref: ${{ github.sha }}

Package.swift

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,9 @@ let package = Package(
7777
name: "\(key)_dependencies",
7878
dependencies: [.target(name: key)],
7979
path: ".Package.swift/\(key)",
80-
linkerSettings:
80+
linkerSettings: [
81+
.linkedLibrary("c++")
82+
] +
8183
(value["frameworks"] as? [String] ?? []).map { .linkedFramework($0) } +
8284
(value["libraries"] as? [String] ?? []).map { .linkedLibrary($0) }
8385
),

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ To get started you can:
5151

5252
- Visit the [Step by Step Tutorial](https://pytorch.org/executorch/stable/getting-started.html) to get things running locally and deploy a model to a device
5353
- Use this [Colab Notebook](https://colab.research.google.com/drive/1qpxrXC3YdJQzly3mRg-4ayYiOjC6rue3?usp=sharing) to start playing around right away
54-
- Jump straight into LLM use cases by following specific instructions for [Llama](examples/models/llama/README.md) and [Llava](examples/models/llava/README.md)
54+
- Jump straight into LLM use cases by following specific instructions for popular open-source models such as [Llama](examples/models/llama/README.md), [Qwen 3](examples/models/qwen3/README.md), [Phi-4-mini](examples/models/phi_4_mini/README.md), and [Llava](examples/models/llava/README.md)
5555

5656
## Feedback and Engagement
5757

backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,9 +88,17 @@
8888
ET_LOG(Error, "%s: DataType=%d is not supported", ETCoreMLStrings.delegateIdentifier.UTF8String, (int)tensor.scalar_type());
8989
return std::nullopt;
9090
}
91-
91+
9292
std::vector<ssize_t> strides(tensor.strides().begin(), tensor.strides().end());
9393
std::vector<size_t> shape(tensor.sizes().begin(), tensor.sizes().end());
94+
95+
// If tensor is rank 0, wrap in rank 1
96+
// See https://github.com/apple/coremltools/blob/8.2/coremltools/converters/mil/frontend/torch/exir_utils.py#L73
97+
if (shape.size() == 0) {
98+
shape.push_back(1);
99+
strides.push_back(1);
100+
}
101+
94102
MultiArray::MemoryLayout layout(dataType.value(), std::move(shape), std::move(strides));
95103
switch (argType) {
96104
case ArgType::Input: {
@@ -233,6 +241,12 @@ ModelLoggingOptions get_logging_options(BackendExecutionContext& context) {
233241
std::array<SizesType, kTensorDimensionLimit> new_shape;
234242
for (size_t i = nInputs; i < nInputs + nOutputs; i++) {
235243
Tensor& t = args[i]->toTensor();
244+
// If t has rank 0, do not resize. delegate_args[i] will have rank 1
245+
// because we resized it in get_multi_array
246+
if (t.dim() == 0) {
247+
continue;
248+
}
249+
236250
int rank = delegate_args[i].layout().rank();
237251
assert (rank <= new_shape.size());
238252
for (int d = 0; d < rank; d++) {

backends/apple/coreml/runtime/test/ETCoreMLModelManagerTests.mm

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
#import <XCTest/XCTest.h>
1616
#import <executorch/runtime/platform/runtime.h>
1717
#import <model_logging_options.h>
18+
#import <multiarray.h>
19+
20+
using namespace executorchcoreml;
1821

1922
@interface ETCoreMLModelManagerTests : XCTestCase
2023

@@ -110,7 +113,7 @@ - (void)testAddModelExecution {
110113
XCTAssertNotNil(inputs);
111114
MLMultiArray *output = [ETCoreMLTestUtils filledMultiArrayWithShape:inputs[0].shape dataType:inputs[0].dataType repeatedValue:@(0) error:&localError];
112115
NSArray<MLMultiArray *> *args = [inputs arrayByAddingObject:output];
113-
XCTAssertTrue([self.modelManager executeModelWithHandle:handle
116+
XCTAssertTrue([self.modelManager executeModelWithHandle:handle
114117
args:args
115118
loggingOptions:executorchcoreml::ModelLoggingOptions()
116119
eventLogger:nullptr
@@ -148,4 +151,77 @@ - (void)testMulModelExecution {
148151
}
149152
}
150153

154+
// See https://github.com/pytorch/executorch/pull/10465
155+
- (void)testAutoreleasepoolError {
156+
NSURL *modelURL = [self.class bundledResourceWithName:@"add_coreml_all" extension:@"bin"];
157+
NSError *localError = nil;
158+
XCTAssertNotNil(modelURL);
159+
160+
NSData *modelData = [NSData dataWithContentsOfURL:modelURL];
161+
MLModelConfiguration *configuration = [[MLModelConfiguration alloc] init];
162+
configuration.computeUnits = MLComputeUnitsAll;
163+
ModelHandle *modelHandle = [self.modelManager loadModelFromAOTData:modelData
164+
configuration:configuration
165+
error:&localError];
166+
XCTAssert(modelHandle);
167+
168+
ETCoreMLModel *model = [self.modelManager modelWithHandle:modelHandle];
169+
XCTAssert(model);
170+
171+
NSArray<MLMultiArray *> *inputArrays =
172+
[ETCoreMLTestUtils inputsForModel:model repeatedValues:@[@(2), @(3)] error:&localError];
173+
XCTAssert(inputArrays);
174+
175+
std::vector<MultiArray> multiArrays;
176+
multiArrays.reserve(inputArrays.count + model.orderedOutputNames.count);
177+
for (MLMultiArray *array in inputArrays) {
178+
auto dataTypeOpt = to_multiarray_data_type(array.dataType);
179+
XCTAssert(dataTypeOpt.has_value());
180+
auto dataType = dataTypeOpt.value();
181+
182+
std::vector<size_t> dims;
183+
for (NSNumber *n in array.shape) {
184+
dims.push_back(n.unsignedLongValue);
185+
}
186+
187+
std::vector<ssize_t> strides(dims.size());
188+
ssize_t currentStride = 1;
189+
for (NSInteger i = dims.size() - 1; i >= 0; --i) {
190+
strides[i] = currentStride;
191+
currentStride *= dims[i];
192+
}
193+
194+
multiArrays.emplace_back(array.dataPointer,
195+
MultiArray::MemoryLayout(dataType, dims, strides));
196+
}
197+
198+
auto inputLayout = multiArrays[0].layout();
199+
size_t bufferSize = inputLayout.num_bytes();
200+
for (NSUInteger i = 0; i < model.orderedOutputNames.count; ++i) {
201+
multiArrays.emplace_back(calloc(1, bufferSize), inputLayout);
202+
}
203+
// corrupt first input shape to force error
204+
{
205+
auto originalLayout = multiArrays[0].layout();
206+
auto corruptedDims = originalLayout.shape();
207+
corruptedDims[0] += 1;
208+
multiArrays[0] = MultiArray(multiArrays[0].data(),
209+
MultiArray::MemoryLayout(originalLayout.dataType(),
210+
corruptedDims,
211+
originalLayout.strides()));
212+
}
213+
214+
BOOL success = [self.modelManager executeModelWithHandle:modelHandle
215+
argsVec:multiArrays
216+
loggingOptions:ModelLoggingOptions()
217+
eventLogger:nullptr
218+
error:&localError];
219+
XCTAssertFalse(success);
220+
XCTAssertNotNil(localError);
221+
222+
for (size_t i = inputArrays.count; i < multiArrays.size(); ++i) {
223+
free(multiArrays[i].data());
224+
}
225+
}
226+
151227
@end

backends/apple/coreml/scripts/install_requirements.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ SCRIPT_DIR_PATH="$(
1212

1313
# TODO(jathu): remove the need to fetch coremltools to build deps for coreml_executor_runner.
1414
# Keep this version in sync with: pyproject.toml
15-
COREMLTOOLS_VERSION="8.2"
15+
COREMLTOOLS_VERSION="8.3"
1616

1717
red=`tput setaf 1`
1818
green=`tput setaf 2`

0 commit comments

Comments
 (0)