
Commit 3a77f70

Update on "[devtool] introduce datasink class to etdump"
This diff introduces the DataSink class, which manages the customized debug data storage pipeline. Details can be found in https://docs.google.com/document/d/1y_m32mKdj-OgLcLUz9TKhBW3PC3bBDYSBbeAH544EfM/edit?tab=t.0

Differential Revision: [D69583422](https://our.internmc.facebook.com/intern/diff/D69583422/)

[ghstack-poisoned]
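The DataSink code itself is not among the files listed below (this is a merge commit, and the diffs come from its two parents), so here is a minimal Python sketch of the kind of pipeline the description suggests: a base class with a single write path, and concrete sinks that choose the storage medium. Every name in this sketch is hypothetical and not taken from the commit.

    from abc import ABC, abstractmethod


    class DataSink(ABC):
        """Hypothetical base class for debug-data sinks: callers hand over
        opaque blobs; the sink decides where and how they are stored."""

        @abstractmethod
        def write(self, blob: bytes) -> int:
            """Store one blob and return a handle (here: a byte offset)."""


    class BufferDataSink(DataSink):
        """Illustrative in-memory sink that appends each blob to one buffer."""

        def __init__(self) -> None:
            self._buffer = bytearray()

        def write(self, blob: bytes) -> int:
            offset = len(self._buffer)  # handle = where this blob starts
            self._buffer.extend(blob)
            return offset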
2 parents 5af97e3 + da8c6b1 commit 3a77f70

File tree: 23 files changed, +709 −50 lines

backends/apple/coreml/partition/coreml_partitioner.py

Lines changed: 15 additions & 1 deletion

@@ -3,7 +3,7 @@
 # Please refer to the license found in the LICENSE file in the root directory of the source tree.

 import logging
-from typing import List, Optional
+from typing import Callable, List, Optional, Tuple

 import coremltools as ct

@@ -104,3 +104,17 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         return PartitionResult(
             tagged_exported_program=exported_program, partition_tags=partition_tags
         )
+
+    def ops_to_not_decompose(
+        self, ep: ExportedProgram
+    ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
+        do_not_decompose = []
+        op_support = OperatorsSupportedForCoreMLBackend()
+        for node in ep.graph.nodes:
+            if (
+                node.op == "call_function"
+                and isinstance(node.target, torch._ops.OpOverload)
+                and op_support.is_node_supported(None, node)
+            ):
+                do_not_decompose.append(node.target)
+        return do_not_decompose, None
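For orientation, a minimal sketch of how this new hook gets used (it mirrors the test added below; `model` and `example_inputs` are assumed to be defined):

    import executorch.exir
    import torch

    from executorch.backends.apple.coreml.partition import CoreMLPartitioner

    ep = torch.export.export(model, example_inputs)
    partitioner = CoreMLPartitioner()

    # to_edge_transform_and_lower consults partitioner.ops_to_not_decompose(ep),
    # so ops the CoreML backend supports natively (e.g. SDPA) are kept whole
    # instead of being decomposed before delegation.
    edge_manager = executorch.exir.to_edge_transform_and_lower(
        ep, partitioner=[partitioner]
    )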

backends/apple/coreml/test/test_coreml_partitioner.py

Lines changed: 46 additions & 0 deletions

@@ -13,6 +13,7 @@

 from executorch.backends.apple.coreml.compiler import CoreMLBackend
 from executorch.backends.apple.coreml.partition import CoreMLPartitioner
+from executorch.exir.backend.utils import format_delegated_graph


 class TestCoreMLPartitioner(unittest.TestCase):
@@ -79,6 +80,50 @@ def test_vit_skip_conv(self):
             "getitem",
         ]

+    def test_ops_to_not_decompose(self):
+        class Model(torch.nn.Module):
+            def forward(self, q, k, v, mask):
+                return torch.ops.aten.scaled_dot_product_attention.default(
+                    q, k, v, attn_mask=mask
+                )
+
+        model = Model()
+        model.eval()
+
+        batch_size = 1
+        n_heads = 12
+        seq_len = 1
+        max_seq_length = 32
+        embedding_dim = 16
+        q = torch.randn(batch_size, n_heads, seq_len, embedding_dim)
+        k = torch.randn(batch_size, n_heads, max_seq_length, embedding_dim)
+        v = torch.randn(batch_size, n_heads, max_seq_length, embedding_dim)
+        mask = torch.randn(seq_len, max_seq_length)
+        example_inputs = (q, k, v, mask)
+        ep = torch.export.export(model, example_inputs)
+        coreml_partitioner = CoreMLPartitioner()
+
+        # Using to_edge_transform_and_lower, we expect SDPA will be preserved and show up in delegated graph
+        edge_program_manager = executorch.exir.to_edge_transform_and_lower(
+            ep, partitioner=[coreml_partitioner]
+        )
+        self.assertTrue(
+            "executorch.exir.dialects.edge._ops.aten.scaled_dot_product_attention.default"
+            in format_delegated_graph(
+                edge_program_manager.exported_program().graph_module
+            )
+        )
+
+        # Using to_edge flow, we expect SDPA will be decomposed and not show up in delegated graph
+        edge_program_manager2 = executorch.exir.to_edge(ep)
+        edge_program_manager2.to_backend(coreml_partitioner)
+        self.assertTrue(
+            "executorch.exir.dialects.edge._ops.aten.scaled_dot_product_attention.default"
+            not in format_delegated_graph(
+                edge_program_manager2.exported_program().graph_module
+            )
+        )
+
     def test_buffer(self):
         embedding_dim = 3
         max_seq_len = 2
@@ -129,4 +174,5 @@ def forward(self, q, k_val, input_pos):
     test_runner = TestCoreMLPartitioner()
     test_runner.test_add_sub_skip_mm()
     test_runner.test_vit_skip_conv()
+    test_runner.test_ops_to_not_decompose()
     test_runner.test_buffer()

backends/arm/_passes/fuse_batchnorm2d_pass.py

Lines changed: 1 addition & 0 deletions

@@ -114,6 +114,7 @@ def try_set_param(
         if not try_set_param(conv_bias_node, fused_conv_bias) and try_set_param(
             bn_bias_node, fused_conv_bias
         ):
+            # pyre-ignore[60]
             # Conv didn't have bias but batchnorm did, steal bias from batchnorm.
             conv_args = (*conv.args[0:2], bn_bias_node, *conv.args[3:])
             conv.args = conv_args

backends/cadence/aot/functions_hifi.yaml

Lines changed: 10 additions & 0 deletions

@@ -204,11 +204,21 @@
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::quantized_linear_out

+- func: cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_linear_per_tensor_out
+
 - func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::quantized_relu_out

+- func: cadence::quantized_relu.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_relu_per_tensor_out
+
 - func: cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null

backends/cadence/hifi/operators/CMakeLists.txt

Lines changed: 2 additions & 2 deletions

@@ -76,8 +76,8 @@ target_include_directories(

 # Custom ops that are needed to run the test model.
 add_library(
-  custom_ops "quantized_linear_out.cpp" "quantized_layer_norm.cpp"
-  "quantize_per_tensor.cpp" "quantized_relu_out.cpp" "dequantize_per_tensor.cpp"
+  custom_ops "op_quantized_linear_out.cpp" "op_quantized_layer_norm.cpp"
+  "op_quantize_per_tensor.cpp" "op_quantized_relu_out.cpp" "op_dequantize_per_tensor.cpp"
 )
 target_include_directories(
   custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR}

backends/cadence/hifi/operators/op_clamp.cpp

Lines changed: 10 additions & 0 deletions

@@ -321,6 +321,16 @@ Tensor& clamp_Tensor_out(

   return out;
 }
+
+Tensor& clamp_tensor_out(
+    RuntimeContext& ctx,
+    const Tensor& in,
+    const executorch::aten::optional<Tensor>& min_opt,
+    const executorch::aten::optional<Tensor>& max_opt,
+    Tensor& out) {
+  return clamp_Tensor_out(ctx, in, min_opt, max_opt, out);
+}
+
 } // namespace native
 } // namespace HiFi
 } // namespace impl

backends/cadence/hifi/operators/op_mean.cpp

Lines changed: 10 additions & 0 deletions

@@ -168,6 +168,16 @@ Tensor& mean_out(
   return out;
 }

+Tensor& mean_dim_out(
+    RuntimeContext& ctx,
+    const Tensor& in,
+    optional<ArrayRef<int64_t>> dim_list,
+    bool keepdim,
+    optional<ScalarType> dtype,
+    Tensor& out) {
+  return mean_out(ctx, in, dim_list, keepdim, dtype, out);
+}
+
 } // namespace native
 } // namespace HiFi
 } // namespace impl

backends/cadence/hifi/operators/op_quantized_relu_out.cpp

Lines changed: 40 additions & 0 deletions

@@ -75,6 +75,46 @@ void quantized_relu_per_tensor_out(
   }
 }

+void quantized_relu_per_tensor_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& input,
+    const Tensor& in_zero_point,
+    const int64_t out_zero_point,
+    const Tensor& out_multiplier,
+    const Tensor& out_shift,
+    Tensor& output) {
+  int8_t _in_zero_point = in_zero_point.const_data_ptr<int8_t>()[0];
+  int32_t _out_multiplier = out_multiplier.const_data_ptr<int32_t>()[0];
+  int32_t _out_shift = out_shift.const_data_ptr<int32_t>()[0];
+
+  quantized_relu_per_tensor_out(
+      ctx,
+      input,
+      _in_zero_point,
+      out_zero_point,
+      _out_multiplier,
+      _out_shift,
+      output);
+}
+
+void quantized_relu_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& input,
+    const int64_t in_zero_point,
+    const int64_t out_zero_point,
+    const int64_t out_multiplier,
+    const int64_t out_shift,
+    Tensor& output) {
+  quantized_relu_per_tensor_out(
+      ctx,
+      input,
+      in_zero_point,
+      out_zero_point,
+      out_multiplier,
+      out_shift,
+      output);
+}
+
 } // namespace native
 } // namespace HiFi
 } // namespace impl

backends/cadence/hifi/operators/op_softmax.cpp

Lines changed: 9 additions & 0 deletions

@@ -194,6 +194,15 @@ Tensor& _softmax_out(
   return out;
 }

+Tensor& softmax_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& in,
+    int64_t dim,
+    bool half_to_float,
+    Tensor& out) {
+  return _softmax_out(ctx, in, dim, half_to_float, out);
+}
+
 } // namespace native
 } // namespace HiFi
 } // namespace impl

build/run_android_emulator.sh

Lines changed: 4 additions & 0 deletions

@@ -18,6 +18,8 @@ $ADB_PATH wait-for-device shell 'while [[ -z $(getprop sys.boot_completed) ]]; d
 echo "List all running emulators"
 $ADB_PATH devices

+adb uninstall com.example.executorchllamademo || true
+adb uninstall com.example.executorchllamademo.test || true
 adb install -t app-debug.apk
 adb install -t app-debug-androidTest.apk

@@ -26,6 +28,8 @@ adb push model.pte /data/local/tmp/llama
 adb push tokenizer.bin /data/local/tmp/llama
 adb shell am instrument -w -r com.example.executorchllamademo.test/androidx.test.runner.AndroidJUnitRunner

+adb uninstall org.pytorch.executorch || true
+adb uninstall org.pytorch.executorch.test || true
 adb install -t android-test-debug.apk
 adb install -t android-test-debug-androidTest.apk
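A note on the added lines: adb uninstall exits nonzero when the package is not installed, so the appended || true keeps a clean emulator from aborting the script; uninstalling before each adb install -t simply guarantees the test apps start from a fresh state.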
