@@ -102,6 +102,7 @@ typedef struct {
 typedef struct {
   Config config; // the hyperparameters of the architecture (the blueprint)
   RunState state; // buffers for the "wave" of activations in the forward pass
+  std::unordered_map<std::string, std::string> metadata;

 #ifdef __AOTI_MODEL__
   torch::inductor::AOTIModelPackageLoader *runner;
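
The Transformer struct gains a metadata cache next to the config and run state. None of the hunks below populate the new field (main() keeps its own local copy), so the following is only a sketch of the caching pattern the member suggests, assuming get_metadata() returns a std::unordered_map<std::string, std::string> to match the field's type:

    // Sketch (assumption): fill the cache once at load time so later
    // lookups don't have to go back through the runner.
    #ifdef __AOTI_MODEL__
      t->metadata = t->runner->get_metadata();
      const std::string &dev = t->metadata["AOTI_DEVICE_KEY"]; // "cpu" or "cuda"
    #endif
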
@@ -141,20 +142,9 @@ void read_checkpoint(char *checkpoint, Config *config) {
   config->vocab_size = abs(config->vocab_size);
 }

-void build_transformer(Transformer *t, char *model_path, int vocab_size,
-                       int seq_len) {
-  // read in the Config and the Weights from the model
-  // read_checkpoint(model_path, &t->config);
-  // allocate the RunState buffers
-  t->config.vocab_size = vocab_size;
-  t->config.seq_len = seq_len;
-  malloc_run_state(&t->state, &t->config);
-
+void build_transformer(Transformer *t, char *model_path) {
 #ifdef __AOTI_MODEL__
   t->runner = new torch::inductor::AOTIModelPackageLoader(model_path);
-  aoti_device = t->runner->get_metadata()["AOTI_DEVICE_KEY"] == "cpu"
-                    ? torch::Device(torch::kCPU)
-                    : torch::Device(torch::kCUDA);
 #else // __ET_MODEL__
   t->runner = new Module(
       /* path to PTE model */ model_path,
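
build_transformer() loses the vocab_size and seq_len parameters along with all config and buffer setup, and the hard-coded AOTI device selection moves out as well; the function now does nothing but construct the runner. The call shape before and after, as seen in the main() hunks further down:

    // Before: the caller needed the final vocab size and sequence length
    // up front.
    //   build_transformer(&transformer, model_path, vocab_size, steps);
    // After: only the path; config and RunState are filled in later in
    // main(), once the tokenizer has reported its vocab size.
    build_transformer(&transformer, model_path);
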
@@ -776,9 +766,6 @@ void error_usage() {
           " -v <int> (optional) vocab size, default is model-specific.\n");
   fprintf(stderr,
           " -l <int> (optional) llama version (2 or 3), default 2.\n");
-  fprintf(
-      stderr,
-      " -d <string> (optional) device(CUDA or CPU) model was exported for\n");
   exit(EXIT_FAILURE);
 }

@@ -848,37 +835,35 @@ int main(int argc, char *argv[]) {
       system_prompt = argv[i + 1];
     } else if (argv[i][1] == 'l') {
       llama_ver = atoi(argv[i + 1]);
-#ifdef __AOTI_MODEL__
-    } else if (argv[i][1] == 'd') {
-#ifdef USE_CUDA
-      if (strcasecmp(argv[i + 1], "CUDA") == 0) {
-        aoti_device = torch::Device(torch::kCUDA);
-      } else
-#endif
-          if (strcasecmp(argv[i + 1], "CPU") == 0) {
-        aoti_device = torch::Device(torch::kCPU);
-      } else {
-        fprintf(stderr, "Unknown device %s", argv[i + 1]);
-        exit(1);
-      }
-#endif
     } else {
       error_usage();
     }
   }

+  if (model_path == NULL) {
+    fprintf(stderr, "No model_path provided.");
+    error_usage();
+  }
+
+  Transformer transformer;
+  build_transformer(&transformer, model_path);
+
+#ifdef __AOTI_MODEL__
+  auto aoti_metadata = transformer.runner->get_metadata();
+  aoti_device = aoti_metadata["AOTI_DEVICE_KEY"] == "cpu"
+                    ? torch::Device(torch::kCPU)
+                    : torch::Device(torch::kCUDA);
+  ModelType model_type = get_model_type(std::stoi(aoti_metadata["tokenizer_type"]));
+#else // __ET_MODEL__
   ModelType model_type = get_model_type(llama_ver);
+#endif
+
   if (model_type == UNKNOWN_MODEL) {
     fprintf(stderr, "Unknown model type passed by -l argument. Received l=%d.",
             llama_ver);
     error_usage();
   }

-  if (model_path == NULL) {
-    fprintf(stderr, "No model_path provided.");
-    error_usage();
-  }
-
   if (tokenizer_path == NULL) {
     fprintf(stderr, "No tokenizer_path provided.");
     error_usage();
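
This hunk is the core of the change: main() validates model_path and builds the transformer first, then reads both the target device (AOTI_DEVICE_KEY) and the model type (tokenizer_type) from the AOTI package metadata instead of from the removed -d flag and, in the AOTI build, the -l flag. One caveat: std::stoi throws if tokenizer_type is absent or non-numeric. A more defensive variant might look like the sketch below; the missing-key fallback is an assumption, not part of this commit:

    auto aoti_metadata = transformer.runner->get_metadata();
    aoti_device = aoti_metadata["AOTI_DEVICE_KEY"] == "cpu"
                      ? torch::Device(torch::kCPU)
                      : torch::Device(torch::kCUDA);
    ModelType model_type = UNKNOWN_MODEL;
    auto it = aoti_metadata.find("tokenizer_type");
    if (it != aoti_metadata.end() && !it->second.empty()) {
      model_type = get_model_type(std::stoi(it->second));
    } // a missing key falls through to the UNKNOWN_MODEL check that follows
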
@@ -901,8 +886,12 @@ int main(int argc, char *argv[]) {
     vocab_size = tokenizer->vocab_size();
   }

-  Transformer transformer;
-  build_transformer(&transformer, model_path, vocab_size, steps);
+  // read in the Config and the Weights from the model
+  // read_checkpoint(model_path, &t->config);
+  // allocate the RunState buffers
+  transformer.config.vocab_size = vocab_size;
+  transformer.config.seq_len = steps;
+  malloc_run_state(&transformer.state, &transformer.config);

   Sampler sampler;
   build_sampler(&sampler, vocab_size, temperature, topp, rng_seed);
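
The last hunk relocates the config and RunState setup that used to live in build_transformer() to after tokenizer construction, since vocab_size can come from the tokenizer rather than the -v flag. The resulting order in main(), with the tokenizer step elided because this diff doesn't show it:

    Transformer transformer;
    build_transformer(&transformer, model_path); // 1. load the model

    // 2. build the tokenizer; it may supply vocab_size (not shown here)

    transformer.config.vocab_size = vocab_size;  // 3. size the buffers last
    transformer.config.seq_len = steps;
    malloc_run_state(&transformer.state, &transformer.config);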