
Commit 32f7770

Update base for Update on "Remove ExecuTorch copy of Vectorized"

All uses are outside ExecuTorch core, so we can just use ATen Vectorized.

Differential Revision: [D66396016](https://our.internmc.facebook.com/intern/diff/D66396016/)

[ghstack-poisoned]

2 parents: 65c1c8a + 2967302


58 files changed: +3454 −134 lines

.ci/scripts/test_llama.sh

Lines changed: 11 additions & 0 deletions
@@ -27,6 +27,10 @@ while [[ $# -gt 0 ]]; do
       MODE="$2" # portable or xnnpack+custom or xnnpack+custom+qe
       shift 2
       ;;
+    -pt2e_quantize)
+      PT2E_QUANTIZE="$2"
+      shift 2
+      ;;
     -upload)
       UPLOAD_DIR="$2"
       shift 2
@@ -44,6 +48,9 @@ MODE=${MODE:-"xnnpack+custom"}
 # Default UPLOAD_DIR to empty string if not set
 UPLOAD_DIR="${UPLOAD_DIR:-}"
 
+# Default PT2E_QUANTIZE to empty string if not set
+PT2E_QUANTIZE="${PT2E_QUANTIZE:-}"
+
 if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
   echo "Expecting atleast 4 positional arguments"
   echo "Usage: [...]"
@@ -234,6 +241,10 @@ if [[ "${COREML}" == "ON" ]]; then
 fi
 if [[ "${QNN}" == "ON" ]]; then
   EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape"
+  echo "PT2E_QUANTIZE is ${PT2E_QUANTIZE}"
+  if [[ "${PT2E_QUANTIZE}" == "qnn_16a16w" ]]; then
+    EXPORT_ARGS+=" --tokenizer_path tokenizer.model --pt2e_quantize qnn_16a16w --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --calibration_data Once "
+  fi
 fi
 # Add dynamically linked library location
 $PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS}
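The new `-pt2e_quantize` flag is optional and defaults to an empty string, so existing invocations are unaffected; only the QNN path consumes it. As a rough illustration (not part of this commit), a local run of the new path could be driven from Python like this, mirroring the argument values used in the CI workflows below; it assumes the working directory is an ExecuTorch checkout with a configured QNN SDK:

```python
# Hypothetical local reproduction of the new CI invocation (illustrative only).
# Assumes an ExecuTorch checkout root with QNN dependencies already set up.
import subprocess

subprocess.run(
    [
        "bash",
        ".ci/scripts/test_llama.sh",
        "-model", "stories110M",
        "-build_tool", "cmake",
        "-mode", "qnn",
        "-dtype", "fp32",
        "-pt2e_quantize", "qnn_16a16w",  # new flag; an empty string skips PT2E quantization
    ],
    check=True,  # raise CalledProcessError if the test script fails
)
```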

.github/workflows/build-wheels-linux.yml

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ jobs:
       test-infra-ref: main
       with-cuda: disabled
       with-rocm: disabled
+      python-versions: '["3.10", "3.11", "3.12"]'
 
   build:
     needs: generate-matrix

.github/workflows/build-wheels-m1.yml

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ jobs:
       test-infra-ref: main
       with-cuda: disabled
       with-rocm: disabled
+      python-versions: '["3.10", "3.11", "3.12"]'
 
   build:
     needs: generate-matrix

.github/workflows/pull.yml

Lines changed: 3 additions & 1 deletion
@@ -368,6 +368,7 @@ jobs:
     strategy:
       matrix:
         dtype: [fp32]
+        pt2e_quantize: [qnn_16a16w, qnn_8a8w]
         mode: [qnn]
       fail-fast: false
     with:
@@ -384,6 +385,7 @@
         DTYPE=${{ matrix.dtype }}
         BUILD_TOOL="cmake"
         MODE=${{ matrix.mode }}
+        PT2E_QUANTIZE=${{ matrix.pt2e_quantize }}
 
         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
         PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
@@ -393,7 +395,7 @@
         # Install requirements for export_llama
         PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
         # Test llama2
-        PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -dtype "${DTYPE}" -mode "${MODE}"
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"
 
   test-phi-3-mini-runner-linux:
     name: test-phi-3-mini-runner-linux

.github/workflows/trunk.yml

Lines changed: 36 additions & 0 deletions
@@ -441,3 +441,39 @@
 
       cmake-out/examples/models/llama/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is"
       echo "::endgroup::"
+
+
+  test-llama-runner-qnn-linux:
+    name: test-llama-runner-qnn-linux
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    strategy:
+      matrix:
+        dtype: [fp32]
+        pt2e_quantize: [qnn_16a16w, qnn_8a8w]
+        mode: [qnn]
+      fail-fast: false
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-qnn-sdk
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 900
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        BUILD_TOOL="cmake"
+        DTYPE=${{ matrix.dtype }}
+        MODE=${{ matrix.mode }}
+        PT2E_QUANTIZE=${{ matrix.pt2e_quantize }}
+
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
+        PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
+
+        # Setup executorch
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
+        # Install requirements for export_llama
+        PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
+        # Test llama2
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"

.gitmodules

Lines changed: 3 additions & 0 deletions
@@ -64,6 +64,9 @@
 [submodule "third-party/pybind11"]
 	path = third-party/pybind11
 	url = https://github.com/pybind/pybind11.git
+[submodule "backends/cadence/fusion_g3/third-party/nnlib/nnlib-FusionG3"]
+	path = backends/cadence/fusion_g3/third-party/nnlib/nnlib-FusionG3
+	url = https://github.com/foss-xtensa/nnlib-FusionG3/
 [submodule "third-party/ao"]
 	path = third-party/ao
 	url = https://github.com/pytorch/ao.git

CMakeLists.txt

Lines changed: 5 additions & 2 deletions
@@ -338,8 +338,6 @@ if(NOT "${_repo_dir_name}" STREQUAL "executorch")
   )
 endif()
 set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/runtime/core/portable_type)
-# We don't need any of C10's CMake macros.
-add_definitions(-DC10_USING_CUSTOM_GENERATED_MACROS)
 
 #
 # The `_<target>_srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}.
@@ -522,6 +520,7 @@ endif()
 target_include_directories(
   executorch_core PUBLIC ${_common_include_directories}
 )
+target_compile_definitions(executorch_core PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
 target_compile_options(executorch_core PUBLIC ${_common_compile_options})
 if(MAX_KERNEL_NUM)
   target_compile_definitions(
@@ -542,6 +541,7 @@ if(EXECUTORCH_BUILD_PYBIND AND APPLE)
   target_include_directories(
     executorch_core_shared PUBLIC ${_common_include_directories}
   )
+  target_compile_definitions(executorch_core_shared PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
   target_compile_options(
     executorch_core_shared PUBLIC ${_common_compile_options}
   )
@@ -562,6 +562,7 @@ endif()
 add_library(executorch ${_executorch__srcs})
 target_link_libraries(executorch PRIVATE executorch_core)
 target_include_directories(executorch PUBLIC ${_common_include_directories})
+target_compile_definitions(executorch PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
 target_compile_options(executorch PUBLIC ${_common_compile_options})
 target_link_options_shared_lib(executorch)
 
@@ -753,6 +754,8 @@ if(EXECUTORCH_BUILD_PYBIND)
   target_include_directories(
     util PUBLIC ${_common_include_directories} ${TORCH_INCLUDE_DIRS}
   )
+  target_compile_definitions(util PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
+
   target_compile_options(util PUBLIC ${_pybind_compile_options})
   target_link_libraries(util PRIVATE torch c10 executorch extension_tensor)
 

backends/apple/coreml/scripts/install_requirements.sh

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ rm -rf "$COREML_DIR_PATH/third-party"
 mkdir "$COREML_DIR_PATH/third-party"
 
 echo "${green}ExecuTorch: Cloning coremltools."
-git clone --depth 1 --branch 8.0 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH
+git clone --depth 1 --branch 8.1 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH
 cd $COREMLTOOLS_DIR_PATH
 
 STATUS=$?

backends/apple/coreml/test/test_coreml_partitioner.py

Lines changed: 5 additions & 13 deletions
@@ -71,23 +71,15 @@ def test_vit_skip_conv(self):
             )
         )
 
-        conv_block = ["aten.convolution.default", "executorch_call_delegate"]
-        safe_softmax_block = [
-            "getitem",
-            "getitem",
-            "getitem",
-            "getitem",
-            "aten.any.dim",
-            "executorch_call_delegate",
-        ]
-        final_block = ["getitem"]
-        total = conv_block + 12 * safe_softmax_block + final_block
-
         assert [
             node.target.__name__
             for node in delegated_program_manager.exported_program().graph.nodes
             if node.op == "call_function"
-        ] == total
+        ] == [
+            "aten.convolution.default",
+            "executorch_call_delegate",
+            "getitem",
+        ]
 
     def test_buffer(self):
         embedding_dim = 3

backends/arm/arm_backend.py

Lines changed: 30 additions & 3 deletions
@@ -52,6 +52,7 @@ def __init__(self):
         self.permute_nhwc = False
         self.quantize_io = False
         self.tosa_version = None
+        self.input_order = None
 
     def ethosu_compile_spec(
         self,
@@ -89,7 +90,7 @@ def ethosu_compile_spec(
             self.compiler_flags.append(extra_flags)
 
         base_tosa_version = "TOSA-0.80.0+BI"
-        if "U55" in config:
+        if "u55" in config:
             # Add the Ethos-U55 extension marker
             base_tosa_version += "+u55"
         self.tosa_version = TosaSpecification.create_from_string(base_tosa_version)
@@ -134,6 +135,14 @@ def set_quantize_io(self, quantize_io: bool = False) -> "ArmCompileSpecBuilder":
         self.quantize_io = quantize_io
         return self
 
+    def set_input_order(self, input_order: str = None) -> "ArmCompileSpecBuilder":
+        """
+        Reorder the inputs coming in. This may be required when inputs > 1.
+        And while using the U55/U85 CompileSpec.
+        """
+        self.input_order = input_order
+        return self
+
     def build(self) -> List[CompileSpec]:
         """
         Generate a list of compile spec objects from the builder
@@ -163,6 +172,13 @@ def build(self) -> List[CompileSpec]:
                 CompileSpec("permute_memory_format", "nhwc".encode())
             )
 
+        if self.input_order:
+            self.compile_spec.append(
+                CompileSpec(
+                    "input_order", " ".join(map(str, self.input_order)).encode()
+                )
+            )
+
         if self.quantize_io:
             self.compile_spec.append(CompileSpec("quantize_io", "True".encode()))
 
@@ -214,13 +230,16 @@ def preprocess(  # noqa: C901
         artifact_path = None
         output_format = ""
         compile_flags = []
+        input_order = []
         for spec in compile_spec:
             if spec.key == "debug_artifact_path":
                 artifact_path = spec.value.decode()
             if spec.key == "output_format":
                 output_format = spec.value.decode()
             if spec.key == "compile_flags":
                 compile_flags.append(spec.value.decode())
+            if spec.key == "input_order":
+                input_order = list(map(int, spec.value.decode().split(",")))
 
         # Check that the output format is set in the compile spec
         if not output_format:
@@ -246,19 +265,27 @@ def preprocess(  # noqa: C901
             )
 
         node_visitors = get_node_visitors(edge_program, tosa_spec)
-
+        input_count = 0
         for node in graph_module.graph.nodes:
             if node.op == "call_function":
                 process_call_function(node, tosa_graph, node_visitors, tosa_spec)
             elif node.op == "placeholder":
                 process_placeholder(node, tosa_graph, edge_program, tosa_spec)
+                if node.name in edge_program.graph_signature.user_inputs:
+                    input_count += 1
             elif node.op == "output":
                 process_output(node, tosa_graph)
             else:
                 # This will only happen if an unpartitioned graph is passed without
                 # any checking of compatibility.
                 dbg_fail(node, tosa_graph, artifact_path)
 
+        if len(input_order) > 0:
+            if input_count != len(input_order):
+                raise RuntimeError(
+                    "The rank of the input order is not equal to amount of input tensors"
+                )
+
         # TODO: It would be awesome if this dump could somehow be done on top level and not here.
         # Problem is that the desc.json has to be created on the tosa_graph object, which we can't
         # access from top level.
@@ -275,7 +302,7 @@ def preprocess(  # noqa: C901
         # preprocess and some consume TOSA fb directly.
         if output_format == "vela":
             # Emit vela_bin_stream format
-            binary = vela_compile(tosa_graph, compile_flags)
+            binary = vela_compile(tosa_graph, compile_flags, input_order)
         elif output_format == "tosa":
             # Emit TOSA flatbuffer
             binary = bytes(tosa_graph.serialize())
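Taken together, the `arm_backend.py` changes thread an optional input order from the builder through to `vela_compile`. A minimal usage sketch follows (illustrative, not from this commit; the Ethos-U config values are placeholders). Note that the lowercase `"u55"` check fixed above is what lets a config like `"ethos-u55-128"` pick up the TOSA `+u55` marker, and that `build()` serializes the order with `" ".join(...)` while `preprocess` splits on commas, so a comma-separated string such as `"1,0"` is the form that survives both steps.

```python
# Minimal sketch of the new builder hook, assuming the import below resolves as
# in this commit; the config string and kwarg values are placeholders.
from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder

compile_spec = (
    ArmCompileSpecBuilder()
    .ethosu_compile_spec(
        "ethos-u55-128",  # lowercase "u55" now matches the fixed check
        system_config="Ethos_U55_High_End_Embedded",  # placeholder Vela system config
        memory_mode="Shared_Sram",                    # placeholder Vela memory mode
    )
    .set_quantize_io(True)
    # New in this diff: swap the order of two model inputs. A comma-separated
    # string survives both the " ".join(...) encoding in build() and the
    # .split(",") decoding in preprocess().
    .set_input_order("1,0")
    .build()
)
```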
