diff --git a/.ci/scripts/run-docs b/.ci/scripts/run-docs
index 3ca460cd2..71f074cef 100755
--- a/.ci/scripts/run-docs
+++ b/.ci/scripts/run-docs
@@ -1,145 +1,67 @@
-# /bin/bash -x
+#!/bin/bash -x
 
-if [ "X$1" == "X" ]; then
+# Check if an argument was provided
+if [ -z "$1" ]; then
   echo "Must specify document to run"
   exit 1
 fi
 
-if [ "$1" == "readme" ]; then
-  echo "::group::Create script to run README"
-  python3 torchchat/utils/scripts/updown.py --create-sections --file README.md --replace 'llama3.1:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh
-  # for good measure, if something happened to updown processor,
-  # and it did not error out, fail with an exit 1
-  echo "exit 1" >> ./run-readme.sh
-  echo "::endgroup::"
-
-  echo "::group::Run README"
-  echo "*******************************************"
-  cat ./run-readme.sh
-  echo "*******************************************"
-  bash -x ./run-readme.sh
-  echo "::endgroup::"
-
-  exit 0
-fi
-
-if [ "$1" == "quantization" ]; then
-  echo "::group::Create script to run quantization"
-  python3 torchchat/utils/scripts/updown.py --create-sections --file docs/quantization.md --replace llama3:stories15M --suppress huggingface-cli,HF_TOKEN > ./run-quantization.sh
-  # for good measure, if something happened to updown processor,
-  # and it did not error out, fail with an exit 1
-  echo "exit 1" >> ./run-quantization.sh
-  echo "::endgroup::"
-
-  echo "::group::Run quantization"
-  echo "*******************************************"
-  cat ./run-quantization.sh
-  echo "*******************************************"
-  bash -x ./run-quantization.sh
-  echo "::endgroup::"
-
-  exit 0
-fi
-
-if [ "$1" == "gguf" ]; then
-  echo "::group::Create script to run gguf"
-  python3 torchchat/utils/scripts/updown.py --file docs/GGUF.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-gguf.sh
-  # for good measure, if something happened to updown processor,
-  # and it did not error out, fail with an exit 1
-  echo "exit 1" >> ./run-gguf.sh
-  echo "::endgroup::"
-
-  echo "::group::Run gguf"
-  echo "*******************************************"
-  cat ./run-gguf.sh
-  echo "*******************************************"
-  bash -x ./run-gguf.sh
-  echo "::endgroup::"
-fi
-
-
-if [ "$1" == "advanced" ]; then
-  echo "::group::Create script to run advanced"
-  python3 torchchat/utils/scripts/updown.py --file docs/ADVANCED-USERS.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-advanced.sh
-  # for good measure, if something happened to updown processor,
-  # and it did not error out, fail with an exit 1
-  echo "exit 1" >> ./run-advanced.sh
-  echo "::endgroup::"
-
-  echo "::group::Run advanced"
-  echo "*******************************************"
-  cat ./run-advanced.sh
-  echo "*******************************************"
-  bash -x ./run-advanced.sh
-  echo "::endgroup::"
-fi
-
-if [ "$1" == "evaluation" ]; then
-  echo "::group::Create script to run evaluation"
-  python3 torchchat/utils/scripts/updown.py --file torchchat/utils/docs/evaluation.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-evaluation.sh
-  # for good measure, if something happened to updown processor,
-  # and it did not error out, fail with an exit 1
-  echo "exit 1" >> ./run-evaluation.sh
-  echo "::endgroup::"
-
-  echo "::group::Run evaluation"
-  echo "*******************************************"
-  cat ./run-evaluation.sh
-  echo "*******************************************"
-  bash -x ./run-evaluation.sh
-fi
-
-if [ "$1" == "multimodal" ]; then
-
-  # Expecting that this might fail this test as-is, because
-  # it's the first on-pr test depending on github secrets for access with HF token access
-
-  echo "::group::Create script to run multimodal"
-  python3 torchchat/utils/scripts/updown.py --file docs/multimodal.md > ./run-multimodal.sh
-  # for good measure, if something happened to updown processor,
-  # and it did not error out, fail with an exit 1
-  echo "exit 1" >> ./run-multimodal.sh
-  echo "::endgroup::"
-
-  echo "::group::Run multimodal"
-  echo "*******************************************"
-  cat ./run-multimodal.sh
-  echo "*******************************************"
-  bash -x ./run-multimodal.sh
-  echo "::endgroup::"
-fi
-
-if [ "$1" == "native" ]; then
-
-  echo "::group::Create script to run native-execution"
-  python3 torchchat/utils/scripts/updown.py --file docs/native-execution.md > ./run-native.sh
-  # for good measure, if something happened to updown processor,
-  # and it did not error out, fail with an exit 1
-  echo "exit 1" >> ./run-native.sh
-  echo "::endgroup::"
-
-  echo "::group::Run native-execution"
-  echo "*******************************************"
-  cat ./run-native.sh
-  echo "*******************************************"
-  bash -x ./run-native.sh
-  echo "::endgroup::"
-fi
-
-if [ "$1" == "distributed" ]; then
-
-  echo "::group::Create script to run distributed"
-  python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --replace 'llama3.1:stories110M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh
-  python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh
-  # for good measure, if something happened to updown processor,
-  # and it did not error out, fail with an exit 1
-  echo "exit 1" >> ./run-distributed.sh
-  echo "::endgroup::"
-
-  echo "::group::Run distributed"
-  echo "*******************************************"
-  cat ./run-distributed.sh
-  echo "*******************************************"
-  bash -x ./run-distributed.sh
-  echo "::endgroup::"
-fi
+# Pre-initialize variables
+filepath=""
+# No inner quotes: $parameters is expanded unquoted below, so quotes would be
+# passed literally to updown.py; the patterns contain no whitespace
+parameters="--replace llama3:stories15M,-l3:-l2 --suppress huggingface-cli,HF_TOKEN"
+script_name="./run-${1}.sh" # Dynamically initialize script name
+
+# Use a case statement to handle the $1 argument
+case "$1" in
+  "readme")
+    filepath="README.md"
+    ;;
+  "quantization")
+    filepath="docs/quantization.md"
+    ;;
+  "gguf")
+    filepath="docs/GGUF.md"
+    ;;
+  "advanced")
+    filepath="docs/ADVANCED-USERS.md"
+    ;;
+  "evaluation")
+    filepath="torchchat/utils/docs/evaluation.md"
+    ;;
+  "multimodal")
+    filepath="docs/multimodal.md"
+    parameters="" # Clear parameters
+    ;;
+  "native")
+    filepath="docs/native-execution.md"
+    parameters="" # Clear parameters
+    ;;
+  "distributed")
+    filepath="docs/distributed.md"
+    parameters="--replace llama3.1:stories110M,-l3:-l2 --suppress huggingface-cli,HF_TOKEN" # Use stories110M to avoid the need for authentication
+    ;;
+  "local")
+    filepath="docs/local-model.md"
+    parameters="" # Clear parameters
+    ;;
+
+  *)
+    echo "Unknown option: $1"
+    exit 1
+    ;;
+esac
+
+# Generate the script
+echo "::group::Create script to run $1"
+python3 torchchat/utils/scripts/updown.py --file "$filepath" $parameters > "$script_name"
+# if something happened to updown processor, and it did not error out, fail with an exit 1
+echo "exit 1" >> "$script_name"
+echo "::endgroup::"
+
+# Run the script
+echo "::group::Run $1"
+echo "*******************************************"
"*******************************************" +cat "$script_name" +echo "*******************************************" +bash -x "$script_name" +echo "::endgroup::" diff --git a/.github/workflows/run-readme-pr-mps.yml b/.github/workflows/run-readme-pr-mps.yml index 4d5cd7e14..db16bc80e 100644 --- a/.github/workflows/run-readme-pr-mps.yml +++ b/.github/workflows/run-readme-pr-mps.yml @@ -15,8 +15,8 @@ jobs: conda create -y -n test-readme-mps-macos python=3.10.11 llvm-openmp conda activate test-readme-mps-macos set -x - # NS: Remove previous installation of torch first - # as this script does not isntall anything into conda env but rather as system dep + # NS: Remove previous installation of torch first + # as this script does not install anything into conda env but rather as system dep pip3 uninstall -y torch || true set -eou pipefail @@ -37,6 +37,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: runner: macos-m1-14 + timeout: 60 script: | set -x conda create -y -n test-quantization-mps-macos python=3.10.11 diff --git a/README.md b/README.md index 2448b0b72..04fb4789e 100644 --- a/README.md +++ b/README.md @@ -413,7 +413,7 @@ torchchat/utils/scripts/build_native.sh et Execute using the runner ```bash -cmake-out/et_run llama3.1.pte -z `python3 torchchat.py where llama3.1`/tokenizer.model -l 3 -i "Once upon a time" +cmake-out/et_run llama3.1.pte -z `python3 torchchat.py where llama3.1`/tokenizer.model -i "Once upon a time" ``` diff --git a/docs/quantization.md b/docs/quantization.md index 704a7ed6a..56fd2182e 100644 --- a/docs/quantization.md +++ b/docs/quantization.md @@ -182,7 +182,7 @@ OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --dso-path llama3_1.so If you built the AOTI runner with link_torchao_ops as discussed in the setup section, you can also use the C++ runner: ``` -OMP_NUM_THREADS=6 ./cmake-out/aoti_run llama3_1.so -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l 3 -i "Once upon a time," +OMP_NUM_THREADS=6 ./cmake-out/aoti_run llama3_1.so -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -i "Once upon a time," # -l 3 ``` #### ExecuTorch @@ -193,7 +193,7 @@ python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"e Note: only the ExecuTorch C++ runner in torchchat when built using the instructions in the setup can run the exported *.pte file. It will not work with the `python torchchat.py generate` command. 
 ```
-./cmake-out/et_run llama3_1.pte -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l 3 -i "Once upon a time,"
+./cmake-out/et_run llama3_1.pte -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l3 -i "Once upon a time,"
 ```
 
 ## Experimental TorchAO MPS lowbit kernels
diff --git a/runner/run.cpp b/runner/run.cpp
index e5c818cfa..d64c636bb 100644
--- a/runner/run.cpp
+++ b/runner/run.cpp
@@ -803,41 +803,53 @@ int main(int argc, char *argv[]) {
   } else {
     error_usage();
   }
-  for (int i = 2; i < argc; i += 2) {
+  for (int i = 2; i < argc; i += 1) {
     // do some basic validation
-    if (i + 1 >= argc) {
-      error_usage();
-    } // must have arg after flag
+    char *parm = argv[i + 1]; // argv[argc] is NULL, so this read is safe
+    // uniarg means the arg comes right after the option letter, POSIX style
+    int uniarg = strlen(argv[i]) > 2;
+
     if (argv[i][0] != '-') {
       error_usage();
     } // must start with dash
-    if (strlen(argv[i]) != 2) {
+
+    if (strlen(argv[i]) < 2) {
       error_usage();
-    } // must be -x (one dash, one letter)
+    } // must have at least dash '-' and option letter
+
+    if (uniarg) {
+      parm = &argv[i][2];
+    } else if (i + 1 >= argc) {
+      error_usage();
+    } // must have arg after option if flag is not contiguous to option
+
     // read in the args
     if (argv[i][1] == 't') {
-      temperature = atof(argv[i + 1]);
+      temperature = atof(parm);
     } else if (argv[i][1] == 'p') {
-      topp = atof(argv[i + 1]);
+      topp = atof(parm);
     } else if (argv[i][1] == 's') {
-      rng_seed = atoi(argv[i + 1]);
+      rng_seed = atoi(parm);
     } else if (argv[i][1] == 'n') {
-      steps = atoi(argv[i + 1]);
+      steps = atoi(parm);
     } else if (argv[i][1] == 'v') {
-      vocab_size = atoi(argv[i + 1]);
+      vocab_size = atoi(parm);
     } else if (argv[i][1] == 'i') {
-      prompt = argv[i + 1];
+      prompt = parm;
     } else if (argv[i][1] == 'z') {
-      tokenizer_path = argv[i + 1];
+      tokenizer_path = parm;
     } else if (argv[i][1] == 'm') {
-      mode = argv[i + 1];
+      mode = parm;
     } else if (argv[i][1] == 'y') {
-      system_prompt = argv[i + 1];
+      system_prompt = parm;
     } else if (argv[i][1] == 'l') {
-      llama_ver = atoi(argv[i + 1]);
+      llama_ver = atoi(parm);
     } else {
       error_usage();
     }
+
+    // skip the separate arg value unless the option was contiguous
+    i += uniarg ? 0 : 1;
   }
 
   if (model_path == NULL) {
diff --git a/torchchat/quant_config/cuda-32.json b/torchchat/quant_config/cuda-32.json
new file mode 100644
index 000000000..90c37250a
--- /dev/null
+++ b/torchchat/quant_config/cuda-32.json
@@ -0,0 +1,5 @@
+{
+    "executor": {"accelerator": "cuda"},
+    "precision": {"dtype": "bf16"},
+    "linear:int4": {"groupsize" : 32}
+}
diff --git a/torchchat/quant_config/mobile-32.json b/torchchat/quant_config/mobile-32.json
new file mode 100644
index 000000000..3afaa7542
--- /dev/null
+++ b/torchchat/quant_config/mobile-32.json
@@ -0,0 +1,4 @@
+{
+    "embedding": {"bitwidth": 4, "groupsize" : 32},
+    "linear:a8w4dq": {"groupsize" : 32}
+}
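
For reference, a short usage sketch of what this patch enables. The commands below are illustrative, not part of the patch: the `.pte`/tokenizer paths are assumptions, and the export line reuses the pre-existing `--quantize <config.json>` convention with one of the new configs.

```bash
# Each documented flow now runs through the same code path in run-docs:
.ci/scripts/run-docs readme   # generates and runs ./run-readme.sh
.ci/scripts/run-docs local    # new target, exercises docs/local-model.md

# The C++ runner now accepts POSIX-style contiguous option arguments,
# so these two invocations are equivalent (paths illustrative):
cmake-out/et_run llama3_1.pte -z tokenizer.model -l 3 -i "Once upon a time,"
cmake-out/et_run llama3_1.pte -z tokenizer.model -l3 -i "Once upon a time,"

# The new group-size-32 configs plug into the existing --quantize flag:
python3 torchchat.py export llama3.1 \
  --quantize torchchat/quant_config/mobile-32.json \
  --output-pte-path llama3_1.pte
```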