Commit d462f4e
pytorchbot committed: 2024-10-19 nightly release (995dcaf)
1 parent 9b9fc93 · commit d462f4e

File tree: 15 files changed, +351 −16 lines

.ci/scripts/test_eval_llama_mmlu.sh

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

set -exu

if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
  PYTHON_EXECUTABLE=python3
fi

# Download and prepare stories model artifacts
prepare_model_artifacts() {
  echo "Preparing stories model artifacts"
  wget -O stories110M.pt "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt"
  wget -O tokenizer.model "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model"
  echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
}

run_and_verify() {
  NOW=$(date +"%H:%M:%S")
  echo "Starting to run eval_llama at ${NOW}"
  if [[ ! -f "stories110M.pt" ]]; then
    echo "stories110M.pt is missing."
    exit 1
  fi
  if [[ ! -f "tokenizer.model" ]]; then
    echo "tokenizer.model is missing."
    exit 1
  fi
  if [[ ! -f "params.json" ]]; then
    echo "params.json is missing."
    exit 1
  fi
  $PYTHON_EXECUTABLE -m examples.models.llama.eval_llama \
    -c stories110M.pt \
    -p params.json \
    -t tokenizer.model \
    -kv \
    -d fp32 \
    --tasks mmlu \
    -f 5 \
    --max_seq_length 2048 \
    --limit 5 > result.txt

  # Verify result.txt
  RESULT=$(cat result.txt)
  EXPECTED_TASK="mmlu"
  EXPECTED_RESULT="acc"
  if [[ "${RESULT}" == "${EXPECTED_TASK}: {"*"${EXPECTED_RESULT}"* ]]; then
    echo "Actual result: ${RESULT}"
    echo "Success"
    exit 0
  else
    echo "Actual result: ${RESULT}"
    echo "Failure; results not the same"
    exit 1
  fi
}

prepare_model_artifacts
run_and_verify
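For reference, the success check above is a bash glob match against the captured output; a rough Python equivalent of the same pattern (the example result line below is invented purely to exercise the pattern, not copied from a real run) looks like this:

import fnmatch

# Invented stand-in for the contents of result.txt; a real run would print the
# harness output for the mmlu task in roughly this "task: {metrics}" shape.
result = "mmlu: {'acc,none': 0.20, 'acc_stderr,none': 0.18}"

# Mirrors: [[ "${RESULT}" == "${EXPECTED_TASK}: {"*"${EXPECTED_RESULT}"* ]]
assert fnmatch.fnmatch(result, "mmlu: {*acc*")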
.ci/scripts/test_eval_llama_wikitext.sh

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

set -exu

if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
  PYTHON_EXECUTABLE=python3
fi

# Download and prepare stories model artifacts
prepare_model_artifacts() {
  echo "Preparing stories model artifacts"
  wget -O stories110M.pt "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt"
  wget -O tokenizer.model "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model"
  echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
}

run_and_verify() {
  NOW=$(date +"%H:%M:%S")
  echo "Starting to run eval_llama at ${NOW}"
  if [[ ! -f "stories110M.pt" ]]; then
    echo "stories110M.pt is missing."
    exit 1
  fi
  if [[ ! -f "tokenizer.model" ]]; then
    echo "tokenizer.model is missing."
    exit 1
  fi
  if [[ ! -f "params.json" ]]; then
    echo "params.json is missing."
    exit 1
  fi
  $PYTHON_EXECUTABLE -m examples.models.llama.eval_llama \
    -c stories110M.pt \
    -p params.json \
    -t tokenizer.model \
    -kv \
    -d fp32 \
    --max_seq_length 2048 \
    --limit 5 > result.txt

  # Verify result.txt
  RESULT=$(cat result.txt)
  EXPECTED_TASK="wikitext"
  EXPECTED_RESULT="word_perplexity"
  if [[ "${RESULT}" == "${EXPECTED_TASK}: {"*"${EXPECTED_RESULT}"* ]]; then
    echo "Actual result: ${RESULT}"
    echo "Success"
    exit 0
  else
    echo "Actual result: ${RESULT}"
    echo "Failure; results not the same"
    exit 1
  fi
}

prepare_model_artifacts
run_and_verify
.ci/scripts/test_llama_runner_eager.sh

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

set -exu

if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
  PYTHON_EXECUTABLE=python3
fi

# Download and prepare stories model artifacts
prepare_model_artifacts() {
  echo "Preparing stories model artifacts"
  wget -O stories110M.pt "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt"
  wget -O tokenizer.model "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model"
  echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
}

run_and_verify() {
  NOW=$(date +"%H:%M:%S")
  echo "Starting to run eval_llama at ${NOW}"
  if [[ ! -f "stories110M.pt" ]]; then
    echo "stories110M.pt is missing."
    exit 1
  fi
  if [[ ! -f "tokenizer.model" ]]; then
    echo "tokenizer.model is missing."
    exit 1
  fi
  if [[ ! -f "params.json" ]]; then
    echo "params.json is missing."
    exit 1
  fi
  $PYTHON_EXECUTABLE -m examples.models.llama.runner.eager \
    -c stories110M.pt \
    -p params.json \
    -t tokenizer.model \
    -kv \
    -d fp32 \
    --max_seq_length 32 \
    --temperature 0 \
    --prompt "Once upon a time," > result.txt

  # Verify result.txt
  RESULT=$(cat result.txt)
  EXPECTED_RESULT="there was a little girl"
  if [[ "${RESULT}" == *"${EXPECTED_RESULT}"* ]]; then
    echo "Actual result: ${RESULT}"
    echo "Success"
    exit 0
  else
    echo "Actual result: ${RESULT}"
    echo "Failure; results not the same"
    exit 1
  fi
}

prepare_model_artifacts
run_and_verify

.github/workflows/pull.yml

Lines changed: 81 additions & 0 deletions
@@ -447,3 +447,84 @@ jobs:

        # run e2e (export, tokenizer and runner)
        PYTHON_EXECUTABLE=python bash .ci/scripts/test_phi_3_mini.sh
+
+  test-eval_llama-wikitext-linux:
+    name: test-eval_llama-wikitext-linux
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    strategy:
+      fail-fast: false
+    with:
+      runner: linux.24xlarge
+      docker-image: executorch-ubuntu-22.04-clang12
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
+
+        # install pybind
+        bash install_requirements.sh --pybind xnnpack
+
+        # install llama requirements
+        bash examples/models/llama/install_requirements.sh
+
+        # run eval_llama wikitext task
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_eval_llama_wikitext.sh
+
+  test-eval_llama-mmlu-linux:
+    name: test-eval_llama-mmlu-linux
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    strategy:
+      fail-fast: false
+    with:
+      runner: linux.24xlarge
+      docker-image: executorch-ubuntu-22.04-clang12
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
+
+        # install pybind
+        bash install_requirements.sh --pybind xnnpack
+
+        # install llama requirements
+        bash examples/models/llama/install_requirements.sh
+
+        # run eval_llama mmlu task
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_eval_llama_mmlu.sh
+
+  test-llama_runner_eager-linux:
+    name: test-llama_runner_eager-linux
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    strategy:
+      fail-fast: false
+    with:
+      runner: linux.24xlarge
+      docker-image: executorch-ubuntu-22.04-clang12
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
+
+        # install pybind
+        bash install_requirements.sh --pybind xnnpack
+
+        # install llama requirements
+        bash examples/models/llama/install_requirements.sh
+
+        # run llama runner in eager mode
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama_runner_eager.sh

.github/workflows/update-viablestrict.yml

Lines changed: 3 additions & 1 deletion
@@ -22,4 +22,6 @@ jobs:
          stable-branch: viable/strict
          requires: '[\"pull\", \"lint\", \"trunk\", \"Build documentation\", \"^Android$\", \"^Apple$\"]'
          secret-bot-token: ${{ secrets.UPDATEBOT_TOKEN }}
-          rockset-api-key: ${{ secrets.ROCKSET_API_KEY }}
+          clickhouse-url: ${{ secrets.CLICKHOUSE_URL }}
+          clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }}
+          clickhouse-password: ${{ secrets.CLICKHOUSE_VIABLESTRICT_PASSWORD }}

examples/models/llama/eval_llama_lib.py

Lines changed: 7 additions & 0 deletions
@@ -291,6 +291,13 @@ def eval_llama(
    # Generate the eval wrapper
    eval_wrapper = gen_eval_wrapper(model_name, args)

+    # Needed for loading mmlu dataset.
+    # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1998/files
+    if args.tasks and "mmlu" in args.tasks:
+        import datasets
+
+        datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
+
    # Evaluate the model
    with torch.no_grad():
        eval_results = simple_evaluate(
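For context, a minimal standalone sketch of what this global toggle does, assuming only that the mmlu task resolves to a script-based Hugging Face dataset (the dataset id below is an assumption, not taken from this diff): setting the flag once is equivalent to passing trust_remote_code=True to every subsequent load_dataset call made by the evaluation harness.

import datasets

# Newer `datasets` releases refuse to run a dataset's bundled loading script
# unless remote code is explicitly trusted; the global flag opts in up front.
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True

# Any later call, including the ones lm-evaluation-harness makes while building
# the mmlu task, now behaves as if trust_remote_code=True had been passed.
ds = datasets.load_dataset("hails/mmlu_no_train", "all")  # dataset id is an assumption
print(ds)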

examples/models/llama/evaluate/eager_eval.py

Lines changed: 6 additions & 1 deletion
@@ -40,7 +40,12 @@ def __init__(

    @property
    def eot_token_id(self):
-        return self._tokenizer.eot_id
+        """
+        The stories model does not have an EOT token, so we use the EOS token instead.
+        """
+        if hasattr(self._tokenizer, "eot_id"):
+            return self._tokenizer.eot_id
+        return self._tokenizer.eos_id

    @property
    def max_length(self):
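A small sketch of the fallback behaviour, using hypothetical stub tokenizers rather than the real Tiktoken/SentencePiece wrappers (the token ids are illustrative only):

class TiktokenLike:
    # A Llama-3-style tokenizer that distinguishes end-of-turn from end-of-sequence.
    eot_id = 128009
    eos_id = 128001

class SentencePieceLike:
    # A stories110M / Llama-2-style tokenizer with no EOT token at all.
    eos_id = 2

def eot_token_id(tokenizer):
    # Same logic as the property above: prefer eot_id, otherwise fall back to eos_id.
    if hasattr(tokenizer, "eot_id"):
        return tokenizer.eot_id
    return tokenizer.eos_id

assert eot_token_id(TiktokenLike()) == 128009
assert eot_token_id(SentencePieceLike()) == 2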

examples/models/llama/llama_transformer.py

Lines changed: 3 additions & 0 deletions
@@ -116,6 +116,9 @@ class ModelArgs:
    bos_count: int = -1  # i.e., a single EOS is used as BOS
    eos_count: int = 2

+    quantization_args: Optional[dict] = None
+    lora_args: Optional[dict] = None
+
    def __post_init__(self):
        if self.n_kv_heads is None:
            self.n_kv_heads = self.n_heads
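A hedged illustration of how these two optional fields might be populated from a checkpoint's params.json; the inner key names (group_size, rank) are inferred from the consistency checks added in model.py below, and all values are made up:

import json

# Hypothetical params.json for a pre-quantized, LoRA-adapted checkpoint.
params = json.loads("""
{
    "dim": 4096,
    "n_heads": 32,
    "n_layers": 32,
    "vocab_size": 128256,
    "quantization_args": {"group_size": 32},
    "lora_args": {"rank": 16}
}
""")

# With the new dataclass fields, ModelArgs(**params) can accept these two extra
# keys instead of failing on unexpected arguments.
print(params.get("quantization_args"))  # {'group_size': 32}
print(params.get("lora_args"))          # {'rank': 16}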

examples/models/llama/model.py

Lines changed: 5 additions & 3 deletions
@@ -165,7 +165,7 @@ def __init__(self, **kwargs):
            )
        elif hasattr(self.args, "use_spin_quant") and self.args.use_spin_quant:
            print("Using SPIN quantization.")
-            self._transform_for_pre_quantization(checkpoint)
+            self._transform_for_pre_quantization(checkpoint, model_args)

            from .source_transformation.pre_quantization import (
                sanitize_checkpoint_from_pre_quantization,
@@ -174,8 +174,9 @@ def __init__(self, **kwargs):
            sanitize_checkpoint_from_pre_quantization(checkpoint)
        elif hasattr(self.args, "use_qat") and self.args.use_qat:
            print("Using QAT quantization.")
-            self._transform_for_pre_quantization(checkpoint)
+            self._transform_for_pre_quantization(checkpoint, model_args)
            if hasattr(self.args, "use_lora") and self.args.use_lora:
+                assert model_args.lora_args["rank"] == self.args.use_lora
                from .source_transformation.lora import (
                    transform_linear_for_lora_after_quantization,
                )
@@ -251,7 +252,7 @@ def get_example_inputs_kvcache_sdpa(self):
            ),  # start_pos, what token of output are we on.
        )

-    def _transform_for_pre_quantization(self, checkpoint):
+    def _transform_for_pre_quantization(self, checkpoint, model_args):
        assert hasattr(self.args, "preq_mode"), "preq_mode must be specified"
        assert self.args.preq_mode in [
            "8da4w",
@@ -264,6 +265,7 @@ def _transform_for_pre_quantization(self):
        from .source_transformation.pre_quantization import (
            transform_linear_for_pre_quantization,
        )
+        assert self.args.preq_group_size == model_args.quantization_args["group_size"]

        mapping = {
            "fp32": torch.float32,

examples/models/llama/runner/eager.py

Lines changed: 2 additions & 2 deletions
@@ -11,11 +11,11 @@
import torch

from examples.models.llama.llama_transformer import ModelArgs
-from executorch.examples.models.llama2.export_llama_lib import (
+from executorch.examples.models.llama.export_llama_lib import (
    _prepare_for_llama_export,
    build_args_parser as _build_args_parser,
)
-from executorch.examples.models.llama2.runner.generation import LlamaRunner
+from executorch.examples.models.llama.runner.generation import LlamaRunner
from executorch.extension.llm.export import LLMEdgeManager
