
Commit 480d5a8

Update on "[ET-VK] Implement prepack nodes"
## Context

This diff implements the idea described in the previous diff in this stack. During export, `et_vk.prepack` nodes will be inserted to convert constant tensors to GPU tensor objects. As a result, Vulkan operators no longer have to account for the possibility that an argument is constant tensor data rather than an actual tensor object.

Differential Revision: [D64603666](https://our.internmc.facebook.com/intern/diff/D64603666/)

[ghstack-poisoned]
2 parents 98c57a6 + e74b97e commit 480d5a8
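
For intuition, here is a minimal hedged sketch of what such a prepack-insertion pass could look like as an FX graph transform. Only the `et_vk.prepack` op name comes from this commit; the `get_attr` heuristic for locating constant tensors, the pass structure, and the exact op target spelling are assumptions, not the actual implementation.

# Hedged sketch of a prepack-insertion pass. Only the et_vk.prepack op name
# comes from this commit; the get_attr heuristic for locating constant
# tensors and the exact op target spelling are assumptions.
import torch
from torch.fx import GraphModule


def insert_prepack_nodes(graph_module: GraphModule) -> GraphModule:
    for node in list(graph_module.graph.nodes):
        # Constant tensors typically appear as get_attr nodes after export.
        if node.op != "get_attr":
            continue
        with graph_module.graph.inserting_after(node):
            prepack = graph_module.graph.create_node(
                "call_function",
                torch.ops.et_vk.prepack.default,  # assumed target spelling
                (node,),
            )
        # Route all other users through the prepack node, so downstream
        # Vulkan operators only ever see GPU tensor objects.
        node.replace_all_uses_with(prepack, delete_user_cb=lambda u: u is not prepack)
    graph_module.graph.lint()
    graph_module.recompile()
    return graph_module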

File tree

12 files changed: +341 −11 lines
.ci/scripts/test_eval_llama_mmlu.sh

Lines changed: 64 additions & 0 deletions

@@ -0,0 +1,64 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

set -exu

if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
  PYTHON_EXECUTABLE=python3
fi

# Download and prepare stories model artifacts
prepare_model_artifacts() {
  echo "Preparing stories model artifacts"
  wget -O stories110M.pt "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt"
  wget -O tokenizer.model "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model"
  echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
}

run_and_verify() {
  NOW=$(date +"%H:%M:%S")
  echo "Starting to run eval_llama at ${NOW}"
  if [[ ! -f "stories110M.pt" ]]; then
    echo "stories110M.pt is missing."
    exit 1
  fi
  if [[ ! -f "tokenizer.model" ]]; then
    echo "tokenizer.model is missing."
    exit 1
  fi
  if [[ ! -f "params.json" ]]; then
    echo "params.json is missing."
    exit 1
  fi
  $PYTHON_EXECUTABLE -m examples.models.llama.eval_llama \
    -c stories110M.pt \
    -p params.json \
    -t tokenizer.model \
    -kv \
    -d fp32 \
    --tasks mmlu \
    -f 5 \
    --max_seq_length 2048 \
    --limit 5 > result.txt

  # Verify result.txt
  RESULT=$(cat result.txt)
  EXPECTED_TASK="mmlu"
  EXPECTED_RESULT="acc"
  if [[ "${RESULT}" == "${EXPECTED_TASK}: {"*"${EXPECTED_RESULT}"* ]]; then
    echo "Actual result: ${RESULT}"
    echo "Success"
    exit 0
  else
    echo "Actual result: ${RESULT}"
    echo "Failure; results not the same"
    exit 1
  fi
}

prepare_model_artifacts
run_and_verify
.ci/scripts/test_eval_llama_wikitext.sh

Lines changed: 62 additions & 0 deletions

@@ -0,0 +1,62 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

set -exu

if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
  PYTHON_EXECUTABLE=python3
fi

# Download and prepare stories model artifacts
prepare_model_artifacts() {
  echo "Preparing stories model artifacts"
  wget -O stories110M.pt "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt"
  wget -O tokenizer.model "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model"
  echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
}

run_and_verify() {
  NOW=$(date +"%H:%M:%S")
  echo "Starting to run eval_llama at ${NOW}"
  if [[ ! -f "stories110M.pt" ]]; then
    echo "stories110M.pt is missing."
    exit 1
  fi
  if [[ ! -f "tokenizer.model" ]]; then
    echo "tokenizer.model is missing."
    exit 1
  fi
  if [[ ! -f "params.json" ]]; then
    echo "params.json is missing."
    exit 1
  fi
  $PYTHON_EXECUTABLE -m examples.models.llama.eval_llama \
    -c stories110M.pt \
    -p params.json \
    -t tokenizer.model \
    -kv \
    -d fp32 \
    --max_seq_length 2048 \
    --limit 5 > result.txt

  # Verify result.txt
  RESULT=$(cat result.txt)
  EXPECTED_TASK="wikitext"
  EXPECTED_RESULT="word_perplexity"
  if [[ "${RESULT}" == "${EXPECTED_TASK}: {"*"${EXPECTED_RESULT}"* ]]; then
    echo "Actual result: ${RESULT}"
    echo "Success"
    exit 0
  else
    echo "Actual result: ${RESULT}"
    echo "Failure; results not the same"
    exit 1
  fi
}

prepare_model_artifacts
run_and_verify
.ci/scripts/test_llama_runner_eager.sh

Lines changed: 62 additions & 0 deletions

@@ -0,0 +1,62 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

set -exu

if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
  PYTHON_EXECUTABLE=python3
fi

# Download and prepare stories model artifacts
prepare_model_artifacts() {
  echo "Preparing stories model artifacts"
  wget -O stories110M.pt "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt"
  wget -O tokenizer.model "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model"
  echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
}

run_and_verify() {
  NOW=$(date +"%H:%M:%S")
  echo "Starting to run eval_llama at ${NOW}"
  if [[ ! -f "stories110M.pt" ]]; then
    echo "stories110M.pt is missing."
    exit 1
  fi
  if [[ ! -f "tokenizer.model" ]]; then
    echo "tokenizer.model is missing."
    exit 1
  fi
  if [[ ! -f "params.json" ]]; then
    echo "params.json is missing."
    exit 1
  fi
  $PYTHON_EXECUTABLE -m examples.models.llama.runner.eager \
    -c stories110M.pt \
    -p params.json \
    -t tokenizer.model \
    -kv \
    -d fp32 \
    --max_seq_length 32 \
    --temperature 0 \
    --prompt "Once upon a time," > result.txt

  # Verify result.txt
  RESULT=$(cat result.txt)
  EXPECTED_RESULT="there was a little girl"
  if [[ "${RESULT}" == *"${EXPECTED_RESULT}"* ]]; then
    echo "Actual result: ${RESULT}"
    echo "Success"
    exit 0
  else
    echo "Actual result: ${RESULT}"
    echo "Failure; results not the same"
    exit 1
  fi
}

prepare_model_artifacts
run_and_verify

.github/workflows/pull.yml

Lines changed: 81 additions & 0 deletions
@@ -447,3 +447,84 @@ jobs:

        # run e2e (export, tokenizer and runner)
        PYTHON_EXECUTABLE=python bash .ci/scripts/test_phi_3_mini.sh

  test-eval_llama-wikitext-linux:
    name: test-eval_llama-wikitext-linux
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    strategy:
      fail-fast: false
    with:
      runner: linux.24xlarge
      docker-image: executorch-ubuntu-22.04-clang12
      submodules: 'true'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
      script: |
        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

        # install pybind
        bash install_requirements.sh --pybind xnnpack

        # install llama requirements
        bash examples/models/llama/install_requirements.sh

        # run eval_llama wikitext task
        PYTHON_EXECUTABLE=python bash .ci/scripts/test_eval_llama_wikitext.sh

  test-eval_llama-mmlu-linux:
    name: test-eval_llama-mmlu-linux
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    strategy:
      fail-fast: false
    with:
      runner: linux.24xlarge
      docker-image: executorch-ubuntu-22.04-clang12
      submodules: 'true'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
      script: |
        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

        # install pybind
        bash install_requirements.sh --pybind xnnpack

        # install llama requirements
        bash examples/models/llama/install_requirements.sh

        # run eval_llama mmlu task
        PYTHON_EXECUTABLE=python bash .ci/scripts/test_eval_llama_mmlu.sh

  test-llama_runner_eager-linux:
    name: test-llama_runner_eager-linux
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    strategy:
      fail-fast: false
    with:
      runner: linux.24xlarge
      docker-image: executorch-ubuntu-22.04-clang12
      submodules: 'true'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
      script: |
        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

        # install pybind
        bash install_requirements.sh --pybind xnnpack

        # install llama requirements
        bash examples/models/llama/install_requirements.sh

        # run llama runner in eager mode
        PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama_runner_eager.sh

.github/workflows/update-viablestrict.yml

Lines changed: 3 additions & 1 deletion
@@ -22,4 +22,6 @@ jobs:
       stable-branch: viable/strict
       requires: '[\"pull\", \"lint\", \"trunk\", \"Build documentation\", \"^Android$\", \"^Apple$\"]'
       secret-bot-token: ${{ secrets.UPDATEBOT_TOKEN }}
-      rockset-api-key: ${{ secrets.ROCKSET_API_KEY }}
+      clickhouse-url: ${{ secrets.CLICKHOUSE_URL }}
+      clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }}
+      clickhouse-password: ${{ secrets.CLICKHOUSE_VIABLESTRICT_PASSWORD }}

backends/transforms/fuse_conv_with_clamp.py

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ def call(self, graph_module: torch.fx.GraphModule):
         with graph_module.graph.inserting_before(preceding_op):
             conv_activation_node = graph_module.graph.create_node(
                 "call_function",
-                torch.ops.et_vk.conv_with_clamp.default,
+                exir_ops.edge.et_vk.conv_with_clamp.default,
                 new_args,
             )
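
The one-line fix above reflects a dialect rule: once a program has been lowered to the edge dialect, node targets are edge operator overloads, so a pass that creates or matches nodes must use `exir_ops.edge.*` targets rather than `torch.ops.*`. A hedged sketch of the distinction follows — the `exir_ops` import matches the convention used across ExecuTorch passes, while the toy pass body is illustrative, not code from this repository.

# Sketch only: shows the ATen-vs-edge target distinction this fix is about.
import torch
from executorch.exir.dialects._ops import ops as exir_ops


def count_convs(graph_module: torch.fx.GraphModule) -> int:
    """Count convolution nodes in an edge-dialect graph (toy example)."""
    n = 0
    for node in graph_module.graph.nodes:
        # Comparing against torch.ops.aten.convolution.default would never
        # match here: after to_edge, node.target is the edge-dialect
        # overload object, not the ATen one.
        if node.target == exir_ops.edge.aten.convolution.default:
            n += 1
    return n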

examples/models/llama/eval_llama_lib.py

Lines changed: 7 additions & 0 deletions
@@ -291,6 +291,13 @@ def eval_llama(
     # Generate the eval wrapper
     eval_wrapper = gen_eval_wrapper(model_name, args)

+    # Needed for loading mmlu dataset.
+    # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1998/files
+    if args.tasks and "mmlu" in args.tasks:
+        import datasets
+
+        datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
+
     # Evaluate the model
     with torch.no_grad():
         eval_results = simple_evaluate(
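
The module-level flag set above mirrors the per-call `trust_remote_code` argument of `datasets.load_dataset`; the global config is the practical opt-in here because the actual load happens deep inside lm-evaluation-harness, where the argument cannot be passed directly. For illustration, the per-call form looks like this — `"cais/mmlu"` is the commonly used Hugging Face dataset id for MMLU, an assumption for this sketch rather than something the diff pins down.

# Per-call equivalent, usable when you control the load_dataset call.
from datasets import load_dataset

# "cais/mmlu" with config "all" is the usual MMLU mirror (assumed here).
mmlu = load_dataset("cais/mmlu", "all", split="validation", trust_remote_code=True)
print(mmlu[0])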

examples/models/llama/evaluate/eager_eval.py

Lines changed: 6 additions & 1 deletion
@@ -40,7 +40,12 @@ def __init__(

     @property
     def eot_token_id(self):
-        return self._tokenizer.eot_id
+        """
+        The stories model does not have an EOT token, so we use the EOS token instead.
+        """
+        if hasattr(self._tokenizer, "eot_id"):
+            return self._tokenizer.eot_id
+        return self._tokenizer.eos_id

     @property
     def max_length(self):

examples/models/llama/runner/eager.py

Lines changed: 2 additions & 2 deletions
@@ -11,11 +11,11 @@
 import torch

 from examples.models.llama.llama_transformer import ModelArgs
-from executorch.examples.models.llama2.export_llama_lib import (
+from executorch.examples.models.llama.export_llama_lib import (
     _prepare_for_llama_export,
     build_args_parser as _build_args_parser,
 )
-from executorch.examples.models.llama2.runner.generation import LlamaRunner
+from executorch.examples.models.llama.runner.generation import LlamaRunner
 from executorch.extension.llm.export import LLMEdgeManager
examples/models/llama/runner/generation.py

Lines changed: 5 additions & 3 deletions
@@ -10,7 +10,7 @@
 import torch

 from executorch.examples.models.llama.llama_transformer import ModelArgs
-from executorch.examples.models.llama.tokenizer.tiktoken import Tokenizer
+from executorch.extension.llm.tokenizer.utils import get_tokenizer


 class CompletionPrediction(TypedDict, total=False):
@@ -53,7 +53,7 @@ def next_token(logits: torch.Tensor, temperature: float, top_p: float) -> int:
 class LlamaRunner(ABC):
     def __init__(self, tokenizer_path: str, model_args: ModelArgs):
         self.params = model_args
-        self.tokenizer = Tokenizer(tokenizer_path)
+        self.tokenizer = get_tokenizer(tokenizer_path)
         assert model_args.vocab_size == self.tokenizer.n_words

     @abstractmethod
@@ -93,7 +93,9 @@ def generate(  # noqa: C901
         else:
             logits = self.forward(tokens=torch.tensor([tokens], dtype=torch.long))
         current_token = next_token(logits, temperature, top_p)
-        if current_token in self.tokenizer.stop_tokens:
+        if current_token == self.tokenizer.eos_id or (
+            hasattr(self, "stop_tokens") and current_token in self.stop_tokens
+        ):
             break
         tokens.append(current_token)
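
The new stop condition works with either tokenizer flavor `get_tokenizer` may return: every tokenizer exposes `eos_id`, while a `stop_tokens` collection is optional. A toy sketch of the same defensive pattern in isolation — both classes below are invented for illustration, not ExecuTorch types.

# Illustrative toy types: probe for the optional attribute instead of
# assuming every tokenizer defines stop_tokens.
class SentencePieceLike:
    eos_id = 2  # SentencePiece-style: eos_id only


class TiktokenLike:
    eos_id = 128001
    stop_tokens = {128001, 128009}  # Tiktoken-style: extra stop ids


def is_stop(tokenizer, current_token: int) -> bool:
    # eos_id is the common denominator; stop_tokens is an optional extra.
    if current_token == tokenizer.eos_id:
        return True
    return hasattr(tokenizer, "stop_tokens") and current_token in tokenizer.stop_tokens


assert is_stop(SentencePieceLike(), 2)
assert is_stop(TiktokenLike(), 128009)
assert not is_stop(SentencePieceLike(), 128009)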
