
Commit 480d5a8

Update on "[ET-VK] Implement prepack nodes"
## Context

This diff implements the idea described in the previous diff in this stack. During export, `et_vk.prepack` nodes will be inserted to convert constant tensors to GPU tensor objects. As a result, Vulkan operators no longer have to account for the possibility that an argument is constant tensor data rather than an actual tensor object.

Differential Revision: [D64603666](https://our.internmc.facebook.com/intern/diff/D64603666/)

[ghstack-poisoned]
2 parents 98c57a6 + e74b97e commit 480d5a8
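
For intuition, here is a minimal hedged sketch of what such a prepack-insertion pass could look like as an FX graph transform. Only the `et_vk.prepack` op name comes from this commit; the `get_attr` heuristic for locating constant tensors, the pass structure, and the exact op target spelling are assumptions, not the actual implementation.

# Hedged sketch of a prepack-insertion pass. Only the et_vk.prepack op name
# comes from this commit; the get_attr heuristic for locating constant
# tensors and the exact op target spelling are assumptions.
import torch
from torch.fx import GraphModule


def insert_prepack_nodes(graph_module: GraphModule) -> GraphModule:
    for node in list(graph_module.graph.nodes):
        # Constant tensors typically appear as get_attr nodes after export.
        if node.op != "get_attr":
            continue
        with graph_module.graph.inserting_after(node):
            prepack = graph_module.graph.create_node(
                "call_function",
                torch.ops.et_vk.prepack.default,  # assumed target spelling
                (node,),
            )
        # Route all other users through the prepack node, so downstream
        # Vulkan operators only ever see GPU tensor objects.
        node.replace_all_uses_with(prepack, delete_user_cb=lambda u: u is not prepack)
    graph_module.graph.lint()
    graph_module.recompile()
    return graph_module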

File tree

12 files changed: +341 −11 lines
.ci/scripts/test_eval_llama_mmlu.sh

Lines changed: 64 additions & 0 deletions

@@ -0,0 +1,64 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

set -exu

if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
  PYTHON_EXECUTABLE=python3
fi

# Download and prepare stories model artifacts
prepare_model_artifacts() {
  echo "Preparing stories model artifacts"
  wget -O stories110M.pt "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt"
  wget -O tokenizer.model "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model"
  echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
}

run_and_verify() {
  NOW=$(date +"%H:%M:%S")
  echo "Starting to run eval_llama at ${NOW}"
  if [[ ! -f "stories110M.pt" ]]; then
    echo "stories110M.pt is missing."
    exit 1
  fi
  if [[ ! -f "tokenizer.model" ]]; then
    echo "tokenizer.model is missing."
    exit 1
  fi
  if [[ ! -f "params.json" ]]; then
    echo "params.json is missing."
    exit 1
  fi
  $PYTHON_EXECUTABLE -m examples.models.llama.eval_llama \
    -c stories110M.pt \
    -p params.json \
    -t tokenizer.model \
    -kv \
    -d fp32 \
    --tasks mmlu \
    -f 5 \
    --max_seq_length 2048 \
    --limit 5 > result.txt

  # Verify result.txt
  RESULT=$(cat result.txt)
  EXPECTED_TASK="mmlu"
  EXPECTED_RESULT="acc"
  if [[ "${RESULT}" == "${EXPECTED_TASK}: {"*"${EXPECTED_RESULT}"* ]]; then
    echo "Actual result: ${RESULT}"
    echo "Success"
    exit 0
  else
    echo "Actual result: ${RESULT}"
    echo "Failure; results not the same"
    exit 1
  fi
}

prepare_model_artifacts
run_and_verify
.ci/scripts/test_eval_llama_wikitext.sh

Lines changed: 62 additions & 0 deletions

@@ -0,0 +1,62 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

set -exu

if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
  PYTHON_EXECUTABLE=python3
fi

# Download and prepare stories model artifacts
prepare_model_artifacts() {
  echo "Preparing stories model artifacts"
  wget -O stories110M.pt "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt"
  wget -O tokenizer.model "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model"
  echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
}

run_and_verify() {
  NOW=$(date +"%H:%M:%S")
  echo "Starting to run eval_llama at ${NOW}"
  if [[ ! -f "stories110M.pt" ]]; then
    echo "stories110M.pt is missing."
    exit 1
  fi
  if [[ ! -f "tokenizer.model" ]]; then
    echo "tokenizer.model is missing."
    exit 1
  fi
  if [[ ! -f "params.json" ]]; then
    echo "params.json is missing."
    exit 1
  fi
  $PYTHON_EXECUTABLE -m examples.models.llama.eval_llama \
    -c stories110M.pt \
    -p params.json \
    -t tokenizer.model \
    -kv \
    -d fp32 \
    --max_seq_length 2048 \
    --limit 5 > result.txt

  # Verify result.txt
  RESULT=$(cat result.txt)
  EXPECTED_TASK="wikitext"
  EXPECTED_RESULT="word_perplexity"
  if [[ "${RESULT}" == "${EXPECTED_TASK}: {"*"${EXPECTED_RESULT}"* ]]; then
    echo "Actual result: ${RESULT}"
    echo "Success"
    exit 0
  else
    echo "Actual result: ${RESULT}"
    echo "Failure; results not the same"
    exit 1
  fi
}

prepare_model_artifacts
run_and_verify
.ci/scripts/test_llama_runner_eager.sh

Lines changed: 62 additions & 0 deletions

@@ -0,0 +1,62 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

set -exu

if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
  PYTHON_EXECUTABLE=python3
fi

# Download and prepare stories model artifacts
prepare_model_artifacts() {
  echo "Preparing stories model artifacts"
  wget -O stories110M.pt "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt"
  wget -O tokenizer.model "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model"
  echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
}

run_and_verify() {
  NOW=$(date +"%H:%M:%S")
  echo "Starting to run eval_llama at ${NOW}"
  if [[ ! -f "stories110M.pt" ]]; then
    echo "stories110M.pt is missing."
    exit 1
  fi
  if [[ ! -f "tokenizer.model" ]]; then
    echo "tokenizer.model is missing."
    exit 1
  fi
  if [[ ! -f "params.json" ]]; then
    echo "params.json is missing."
    exit 1
  fi
  $PYTHON_EXECUTABLE -m examples.models.llama.runner.eager \
    -c stories110M.pt \
    -p params.json \
    -t tokenizer.model \
    -kv \
    -d fp32 \
    --max_seq_length 32 \
    --temperature 0 \
    --prompt "Once upon a time," > result.txt

  # Verify result.txt
  RESULT=$(cat result.txt)
  EXPECTED_RESULT="there was a little girl"
  if [[ "${RESULT}" == *"${EXPECTED_RESULT}"* ]]; then
    echo "Actual result: ${RESULT}"
    echo "Success"
    exit 0
  else
    echo "Actual result: ${RESULT}"
    echo "Failure; results not the same"
    exit 1
  fi
}

prepare_model_artifacts
run_and_verify

.github/workflows/pull.yml

Lines changed: 81 additions & 0 deletions
@@ -447,3 +447,84 @@ jobs:

        # run e2e (export, tokenizer and runner)
        PYTHON_EXECUTABLE=python bash .ci/scripts/test_phi_3_mini.sh

  test-eval_llama-wikitext-linux:
    name: test-eval_llama-wikitext-linux
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    strategy:
      fail-fast: false
    with:
      runner: linux.24xlarge
      docker-image: executorch-ubuntu-22.04-clang12
      submodules: 'true'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
      script: |
        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

        # install pybind
        bash install_requirements.sh --pybind xnnpack

        # install llama requirements
        bash examples/models/llama/install_requirements.sh

        # run eval_llama wikitext task
        PYTHON_EXECUTABLE=python bash .ci/scripts/test_eval_llama_wikitext.sh

  test-eval_llama-mmlu-linux:
    name: test-eval_llama-mmlu-linux
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    strategy:
      fail-fast: false
    with:
      runner: linux.24xlarge
      docker-image: executorch-ubuntu-22.04-clang12
      submodules: 'true'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
      script: |
        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

        # install pybind
        bash install_requirements.sh --pybind xnnpack

        # install llama requirements
        bash examples/models/llama/install_requirements.sh

        # run eval_llama mmlu task
        PYTHON_EXECUTABLE=python bash .ci/scripts/test_eval_llama_mmlu.sh

  test-llama_runner_eager-linux:
    name: test-llama_runner_eager-linux
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    strategy:
      fail-fast: false
    with:
      runner: linux.24xlarge
      docker-image: executorch-ubuntu-22.04-clang12
      submodules: 'true'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
      script: |
        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

        # install pybind
        bash install_requirements.sh --pybind xnnpack

        # install llama requirements
        bash examples/models/llama/install_requirements.sh

        # run llama runner in eager mode
        PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama_runner_eager.sh

.github/workflows/update-viablestrict.yml

Lines changed: 3 additions & 1 deletion
@@ -22,4 +22,6 @@ jobs:
       stable-branch: viable/strict
       requires: '[\"pull\", \"lint\", \"trunk\", \"Build documentation\", \"^Android$\", \"^Apple$\"]'
       secret-bot-token: ${{ secrets.UPDATEBOT_TOKEN }}
-      rockset-api-key: ${{ secrets.ROCKSET_API_KEY }}
+      clickhouse-url: ${{ secrets.CLICKHOUSE_URL }}
+      clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }}
+      clickhouse-password: ${{ secrets.CLICKHOUSE_VIABLESTRICT_PASSWORD }}

backends/transforms/fuse_conv_with_clamp.py

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ def call(self, graph_module: torch.fx.GraphModule):
         with graph_module.graph.inserting_before(preceding_op):
             conv_activation_node = graph_module.graph.create_node(
                 "call_function",
-                torch.ops.et_vk.conv_with_clamp.default,
+                exir_ops.edge.et_vk.conv_with_clamp.default,
                 new_args,
             )
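
The one-line fix above reflects a dialect rule: once a program has been lowered to the edge dialect, node targets are edge operator overloads, so a pass that creates or matches nodes must use `exir_ops.edge.*` targets rather than `torch.ops.*`. A hedged sketch of the distinction follows — the `exir_ops` import matches the convention used across ExecuTorch passes, while the toy pass body is illustrative, not code from this repository.

# Sketch only: shows the ATen-vs-edge target distinction this fix is about.
import torch
from executorch.exir.dialects._ops import ops as exir_ops


def count_convs(graph_module: torch.fx.GraphModule) -> int:
    """Count convolution nodes in an edge-dialect graph (toy example)."""
    n = 0
    for node in graph_module.graph.nodes:
        # Comparing against torch.ops.aten.convolution.default would never
        # match here: after to_edge, node.target is the edge-dialect
        # overload object, not the ATen one.
        if node.target == exir_ops.edge.aten.convolution.default:
            n += 1
    return n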

examples/models/llama/eval_llama_lib.py

Lines changed: 7 additions & 0 deletions
@@ -291,6 +291,13 @@ def eval_llama(
     # Generate the eval wrapper
     eval_wrapper = gen_eval_wrapper(model_name, args)

+    # Needed for loading mmlu dataset.
+    # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1998/files
+    if args.tasks and "mmlu" in args.tasks:
+        import datasets
+
+        datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
+
     # Evaluate the model
     with torch.no_grad():
         eval_results = simple_evaluate(
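
The module-level flag set above mirrors the per-call `trust_remote_code` argument of `datasets.load_dataset`; the global config is the practical opt-in here because the actual load happens deep inside lm-evaluation-harness, where the argument cannot be passed directly. For illustration, the per-call form looks like this — `"cais/mmlu"` is the commonly used Hugging Face dataset id for MMLU, an assumption for this sketch rather than something the diff pins down.

# Per-call equivalent, usable when you control the load_dataset call.
from datasets import load_dataset

# "cais/mmlu" with config "all" is the usual MMLU mirror (assumed here).
mmlu = load_dataset("cais/mmlu", "all", split="validation", trust_remote_code=True)
print(mmlu[0])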

examples/models/llama/evaluate/eager_eval.py

Lines changed: 6 additions & 1 deletion
@@ -40,7 +40,12 @@ def __init__(

     @property
     def eot_token_id(self):
-        return self._tokenizer.eot_id
+        """
+        The stories model does not have an EOT token, so we use the EOS token instead.
+        """
+        if hasattr(self._tokenizer, "eot_id"):
+            return self._tokenizer.eot_id
+        return self._tokenizer.eos_id

     @property
     def max_length(self):

examples/models/llama/runner/eager.py

Lines changed: 2 additions & 2 deletions
@@ -11,11 +11,11 @@
 import torch

 from examples.models.llama.llama_transformer import ModelArgs
-from executorch.examples.models.llama2.export_llama_lib import (
+from executorch.examples.models.llama.export_llama_lib import (
     _prepare_for_llama_export,
     build_args_parser as _build_args_parser,
 )
-from executorch.examples.models.llama2.runner.generation import LlamaRunner
+from executorch.examples.models.llama.runner.generation import LlamaRunner
 from executorch.extension.llm.export import LLMEdgeManager
examples/models/llama/runner/generation.py

Lines changed: 5 additions & 3 deletions
@@ -10,7 +10,7 @@
 import torch

 from executorch.examples.models.llama.llama_transformer import ModelArgs
-from executorch.examples.models.llama.tokenizer.tiktoken import Tokenizer
+from executorch.extension.llm.tokenizer.utils import get_tokenizer


 class CompletionPrediction(TypedDict, total=False):
@@ -53,7 +53,7 @@ def next_token(logits: torch.Tensor, temperature: float, top_p: float) -> int:
 class LlamaRunner(ABC):
     def __init__(self, tokenizer_path: str, model_args: ModelArgs):
         self.params = model_args
-        self.tokenizer = Tokenizer(tokenizer_path)
+        self.tokenizer = get_tokenizer(tokenizer_path)
         assert model_args.vocab_size == self.tokenizer.n_words

     @abstractmethod
@@ -93,7 +93,9 @@ def generate(  # noqa: C901
         else:
             logits = self.forward(tokens=torch.tensor([tokens], dtype=torch.long))
         current_token = next_token(logits, temperature, top_p)
-        if current_token in self.tokenizer.stop_tokens:
+        if current_token == self.tokenizer.eos_id or (
+            hasattr(self, "stop_tokens") and current_token in self.stop_tokens
+        ):
             break
         tokens.append(current_token)
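
The new stop condition works with either tokenizer flavor `get_tokenizer` may return: every tokenizer exposes `eos_id`, while a `stop_tokens` collection is optional. A toy sketch of the same defensive pattern in isolation — both classes below are invented for illustration, not ExecuTorch types.

# Illustrative toy types: probe for the optional attribute instead of
# assuming every tokenizer defines stop_tokens.
class SentencePieceLike:
    eos_id = 2  # SentencePiece-style: eos_id only


class TiktokenLike:
    eos_id = 128001
    stop_tokens = {128001, 128009}  # Tiktoken-style: extra stop ids


def is_stop(tokenizer, current_token: int) -> bool:
    # eos_id is the common denominator; stop_tokens is an optional extra.
    if current_token == tokenizer.eos_id:
        return True
    return hasattr(tokenizer, "stop_tokens") and current_token in tokenizer.stop_tokens


assert is_stop(SentencePieceLike(), 2)
assert is_stop(TiktokenLike(), 128009)
assert not is_stop(SentencePieceLike(), 128009)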
