
Commit 9ce87e3

Update base for Update on "[ET-VK] Using a single GPU buffer for all tensor uniforms."
This diff changes the Tensor class to store all uniforms in a single uniform buffer. Entities stored in uniforms, i.e. sizes, strides, numel, and logical limits, are now stored in one buffer, and their offsets are stored as unsigned ints in the Tensor class.

Other changes include:
- a new ctor for the ParamsBuffer class to allow allocation by size, without a data ptr;
- an offset input to the Buffer::data function;
- an offset parameter to the BufferBindInfo ctor, so an additional offset can be supplied when binding a buffer.

Differential Revision: [D65841750](https://our.internmc.facebook.com/intern/diff/D65841750/)

[ghstack-poisoned]
2 parents 40b1f5d + b9a1762 commit 9ce87e3
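To make the single-buffer layout concrete, here is a minimal, self-contained C++ sketch of the idea. The names `UniformBuffer`, `TensorUniforms`, and `kAlign` are hypothetical stand-ins (plain host memory, no Vulkan); they only loosely mirror the ParamsBuffer, Buffer::data, and BufferBindInfo changes described in the summary and are not the backend's actual API:

```cpp
// Hypothetical sketch: one allocation holds all per-tensor uniforms,
// and each field is addressed by an offset into that allocation.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Stand-in for a GPU-visible buffer allocated by size, with no initial data ptr.
struct UniformBuffer {
    std::vector<uint8_t> storage;
    explicit UniformBuffer(size_t size) : storage(size) {}
    // data() takes an offset so callers can address a sub-range.
    uint8_t* data(size_t offset = 0) { return storage.data() + offset; }
};

// Stand-in for a bind description: which buffer, at which extra offset.
struct BufferBindInfo {
    UniformBuffer* buffer;
    size_t offset;
};

struct TensorUniforms {
    // Uniform-buffer offsets typically need alignment (64B is a common bound).
    static constexpr size_t kAlign = 64;
    UniformBuffer ubo;
    size_t sizes_offset, strides_offset, numel_offset, limits_offset;

    TensorUniforms()
        : ubo(4 * kAlign),
          sizes_offset(0 * kAlign),
          strides_offset(1 * kAlign),
          numel_offset(2 * kAlign),
          limits_offset(3 * kAlign) {}

    template <typename T>
    void write(size_t offset, const T& value) {
        std::memcpy(ubo.data(offset), &value, sizeof(T));
    }

    BufferBindInfo bind_numel() { return {&ubo, numel_offset}; }
};

int main() {
    TensorUniforms t;
    t.write(t.numel_offset, int32_t{24});

    // Binding a field means binding a sub-range of the shared buffer.
    BufferBindInfo info = t.bind_numel();
    int32_t numel = 0;
    std::memcpy(&numel, info.buffer->data(info.offset), sizeof(numel));
    std::cout << "numel bound at offset " << info.offset << ": " << numel << "\n";
    return 0;
}
```

The design point is that one allocation plus per-field offsets replaces several small per-uniform buffers, so binding a field becomes binding a (buffer, offset) sub-range of the shared buffer rather than managing a separate descriptor per uniform.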

File tree

26 files changed: +249 −176 lines changed


.ci/scripts/gather_test_models.py

Lines changed: 11 additions & 9 deletions
@@ -20,16 +20,16 @@
 CUSTOM_RUNNERS = {
     "linux": {
         # This one runs OOM on smaller runner, the root cause is unclear (T163016365)
-        "w2l": "linux.12xlarge",
-        "ic4": "linux.12xlarge",
-        "resnet50": "linux.12xlarge",
-        "llava": "linux.12xlarge",
-        "llama3_2_vision_encoder": "linux.12xlarge",
-        # "llama3_2_text_decoder": "linux.12xlarge", # TODO: re-enable test when Huy's change is in / model gets smaller.
+        "w2l": "linux.4xlarge.memory",
+        "ic4": "linux.4xlarge.memory",
+        "resnet50": "linux.4xlarge.memory",
+        "llava": "linux.4xllarge.memory",
+        "llama3_2_vision_encoder": "linux.4xlarge.memory",
+        "llama3_2_text_decoder": "linux.4xlarge.memory",
         # This one causes timeout on smaller runner, the root cause is unclear (T161064121)
-        "dl3": "linux.12xlarge",
-        "emformer_join": "linux.12xlarge",
-        "emformer_predict": "linux.12xlarge",
+        "dl3": "linux.4xlarge.memory",
+        "emformer_join": "linux.4xlarge.memory",
+        "emformer_predict": "linux.4xlarge.memory",
     }
 }

@@ -39,10 +39,12 @@
     "linux": {
         "mobilebert": 90,
         "emformer_predict": 360,
+        "llama3_2_text_decoder": 360,
     },
     "macos": {
         "mobilebert": 90,
         "emformer_predict": 360,
+        "llama3_2_text_decoder": 360,
     },
 }

.ci/scripts/setup-macos.sh

Lines changed: 3 additions & 0 deletions
@@ -49,6 +49,9 @@ install_buck() {

   rm "${BUCK2}"
   popd
+
+  # Kill all running buck2 daemon for a fresh start
+  buck2 killall || true
 }

 function write_sccache_stub() {

.github/workflows/apple.yml

Lines changed: 6 additions & 0 deletions
@@ -42,6 +42,8 @@ jobs:

   build-demo-ios:
     name: build-demo-ios
+    # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
+    if: ${{ !github.event.pull_request.head.repo.fork }}
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     secrets: inherit
     with:

@@ -190,6 +192,8 @@
       ) done

   upload-frameworks-ios:
+    # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
+    if: ${{ !github.event.pull_request.head.repo.fork }}
     runs-on: ubuntu-22.04
     needs: [build-frameworks-ios, set-version]
     timeout-minutes: 30

@@ -278,6 +282,8 @@

   build-benchmark-app:
     name: build-benchmark-app
+    # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
+    if: ${{ !github.event.pull_request.head.repo.fork }}
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     secrets: inherit
     with:

.github/workflows/pull.yml

Lines changed: 1 addition & 1 deletion
@@ -332,7 +332,7 @@ jobs:
       docker-image: executorch-ubuntu-22.04-clang12

   unittest-arm:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-arm-sdk

.github/workflows/trunk.yml

Lines changed: 4 additions & 2 deletions
@@ -131,7 +131,7 @@ jobs:

   test-arm-backend-delegation:
     name: test-arm-backend-delegation
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-arm-sdk

@@ -157,7 +157,7 @@

   test-arm-reference-delegation:
     name: test-arm-reference-delegation
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-arm-sdk

@@ -351,6 +351,8 @@
     done

   test-huggingface-transformers:
+    # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
+    if: ${{ !github.event.pull_request.head.repo.fork }}
     name: test-huggingface-transformers
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     secrets: inherit

backends/apple/coreml/runtime/test/ETCoreMLModelDebuggerTests.mm

Lines changed: 0 additions & 1 deletion
@@ -151,7 +151,6 @@ - (void)testMV3ProgramDebugging {
     XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_13_cast_fp16")]);
     XCTAssertNotNil(debuggingResults[make_path_with_output_name("_inversed_aten_div_tensor_24_cast_fp16")]);
     XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_mean_dim_7_cast_fp16")]);
-    XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_clamp_default_54_cast_fp16")]);
     XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_22_cast_fp16")]);
     XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_mul_tensor_27_cast_fp16")]);
 }

backends/apple/coreml/runtime/test/ETCoreMLModelProfilerTests.mm

Lines changed: 0 additions & 1 deletion
@@ -146,7 +146,6 @@ - (void)testMV3ProgramProfiling {
     XCTAssertNotNil(profilingResult[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_13_cast_fp16")]);
     XCTAssertNotNil(profilingResult[make_path_with_output_name("_inversed_aten_div_tensor_24_cast_fp16")]);
     XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_mean_dim_7_cast_fp16")]);
-    XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_clamp_default_54_cast_fp16")]);
     XCTAssertNotNil(profilingResult[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_22_cast_fp16")]);
     XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_mul_tensor_27_cast_fp16")]);
 };

backends/arm/_passes/cast_int64_pass.py

Lines changed: 32 additions & 11 deletions
@@ -5,8 +5,15 @@

 # pyre-unsafe

+import logging
+
 import torch
+from executorch.backends.arm._passes.arm_pass_utils import is_param_node
 from executorch.exir.pass_base import ExportPass, PassResult
+from torch._export.utils import is_buffer
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.WARNING)


 class CastInt64ToInt32Pass(ExportPass):
@@ -18,17 +25,31 @@ def _to_int32(self, graph_module: torch.fx.GraphModule):
         for node in graph_module.graph.nodes:
             fake_tensor = node.meta["val"]
             if isinstance(fake_tensor, torch._subclasses.fake_tensor.FakeTensor):
-                if node.meta["val"].dtype == torch.int64:
-                    node.meta["val"] = node.meta["val"].to(torch.int32)
-                    buffer_name = (
-                        self.exported_program.graph_signature.inputs_to_buffers[
-                            node.name
-                        ]
-                    )
-                    new_tensor = self.exported_program.state_dict[buffer_name].to(
-                        torch.int32
-                    )
-                    self.exported_program.state_dict[buffer_name] = new_tensor
+                if node.meta["val"].dtype == torch.int64 and is_param_node(
+                    self.exported_program, node
+                ):
+                    if is_buffer(self.exported_program, node):
+                        node.meta["val"] = node.meta["val"].to(torch.int32)
+                        buffer_name = (
+                            self.exported_program.graph_signature.inputs_to_buffers[
+                                node.name
+                            ]
+                        )
+                        buffer = self.exported_program.state_dict[node.name]
+                        logger.warning(
+                            f"Casting buffer {node.name} from torch.int64 to torch.int32"
+                            f" defined in {node.meta['stack_trace']}"
+                        )
+                        if torch.min(buffer) < torch.iinfo(torch.int32).min:
+                            raise RuntimeError(
+                                f"Buffer {node.name} has value < {torch.iinfo(torch.int32).min}"
+                            )
+                        if torch.max(buffer) > torch.iinfo(torch.int32).max:
+                            raise RuntimeError(
+                                f"Buffer {node.name} has value > {torch.iinfo(torch.int32).max}"
+                            )
+                        buffer_int32 = buffer.to(torch.int32)
+                        self.exported_program.state_dict[buffer_name] = buffer_int32

     def call(self, graph_module: torch.fx.GraphModule):
         self._to_int32(graph_module)

backends/arm/_passes/scalars_to_attribute_pass.py

Lines changed: 5 additions & 0 deletions
@@ -51,6 +51,11 @@ def call(self, graph_module: GraphModule) -> PassResult:
             if isinstance(arg, Node):
                 new_args.append(arg)
                 continue
+            if isinstance(arg, int) and not torch.is_floating_point(
+                get_first_fake_tensor(n)
+            ):
+                new_args.append(arg)
+                continue

             prefix = "_tensor_constant_"
             get_new_attr_name = get_new_attr_name_with_prefix(prefix)

backends/arm/test/ops/test_scalars.py

Lines changed: 9 additions & 0 deletions
@@ -75,6 +75,12 @@ def forward(self, x):
             x = 1.0 + x
             return x

+    class ShiftInplaceSub(torch.nn.Module):
+        def forward(self, x):
+            x = x >> 4
+            x -= 10
+            return x
+
     # Inplace ops end with '_' (from aten naming)
     ops = [
         ("Add", Add()),

@@ -160,3 +166,6 @@ def test_MI_const(self, test_name: str, op: torch.nn.Module, x):
     @parameterized.expand(tensor_scalar_tests)
     def test_BI(self, test_name: str, op: torch.nn.Module, x, y):
         self._test_add_tosa_BI_pipeline(op, (x, y))
+
+    def test_shift_sub_inplace_tosa_MI(self):
+        self._test_add_tosa_MI_pipeline(self.ShiftInplaceSub(), (torch.IntTensor(5),))
