diff --git a/.ci/scripts/run-docs b/.ci/scripts/run-docs index 71f074cef..4e5881c42 100755 --- a/.ci/scripts/run-docs +++ b/.ci/scripts/run-docs @@ -8,13 +8,16 @@ fi # Pre-initialize variables filepath="" -parameters="--replace 'llama3:stories15M,-l3:-l2' --suppress huggingface-cli,HF_TOKEN" +# cuda supports padding, so no need to replace quantization for now. +# otherwise add: 'cuda.json:cuda-32.json' to replace rules +parameters="--replace llama3:stories15M,-l3:-l2,mobile.json:mobile-32.json --suppress huggingface-cli,HF_TOKEN" script_name="./run-${1}.sh" # Dynamically initialize script name # Use a case statement to handle the $1 argument case "$1" in "readme") filepath="README.md" + parameters="--replace llama3.1:stories15M,-l3:-l2,mobile.json:mobile-32.json --suppress huggingface-cli,HF_TOKEN" ;; "quantization") filepath="docs/quantization.md" @@ -63,5 +66,6 @@ echo "::group::Run $1" echo "*******************************************" cat "$script_name" echo "*******************************************" -bash -x "$script_name" +set -x +. 
"$script_name" echo "::endgroup::" diff --git a/.github/workflows/run-readme-pr-linuxaarch64.yml b/.github/workflows/run-readme-pr-linuxaarch64.yml index 1f22c4f2e..e765e1993 100644 --- a/.github/workflows/run-readme-pr-linuxaarch64.yml +++ b/.github/workflows/run-readme-pr-linuxaarch64.yml @@ -23,6 +23,9 @@ jobs: uname -a echo "::endgroup::" + which pip || true + which pip3 || true + which conda || true TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs readme echo "::group::Completion" @@ -44,7 +47,11 @@ jobs: echo "::group::Print machine info" uname -a echo "::endgroup::" - + + which pip || true + which pip3 || true + which conda || true + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs quantization test-gguf-cpu: @@ -62,6 +69,10 @@ jobs: uname -a echo "::endgroup::" + which pip || true + which pip3 || true + which conda || true + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs gguf echo "::group::Completion" @@ -84,6 +95,10 @@ jobs: uname -a echo "::endgroup::" + which pip || true + which pip3 || true + which conda || true + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs advanced echo "::group::Completion" @@ -106,6 +121,10 @@ jobs: uname -a echo "::endgroup::" + which pip || true + which pip3 || true + which conda || true + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs evaluation echo "::group::Completion" diff --git a/.github/workflows/run-readme-pr-macos.yml b/.github/workflows/run-readme-pr-macos.yml index ce84d3b50..114d0a569 100644 --- a/.github/workflows/run-readme-pr-macos.yml +++ b/.github/workflows/run-readme-pr-macos.yml @@ -33,8 +33,13 @@ jobs: sysctl machdep.cpu.core_count echo "::endgroup::" + which pip || true + which pip3 || true + which conda || true + echo "using workaround for #1416 and #1315 by setting torchchat device explicitly" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs readme + export TORCHCHAT_DEVICE=cpu + . 
.ci/scripts/run-docs readme echo "::group::Completion" echo "tests complete" @@ -70,8 +75,9 @@ jobs: echo "::endgroup::" echo "using workaround for #1416 and #1315 by setting torchchat device explicitly" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs quantization - + export TORCHCHAT_DEVICE=cpu + . .ci/scripts/run-docs quantization + echo "::group::Completion" echo "tests complete" echo "*******************************************" @@ -106,7 +112,8 @@ jobs: echo "::endgroup::" echo "using workaround for #1416 and #1315 by setting torchchat device explicitly" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs gguf + export TORCHCHAT_DEVICE=cpu + # .ci/scripts/run-docs gguf echo "::group::Completion" echo "tests complete" @@ -141,7 +148,8 @@ jobs: echo "::endgroup::" echo "using workaround for #1416 and #1315 by setting torchchat device explicitly" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs advanced + export TORCHCHAT_DEVICE=cpu + . .ci/scripts/run-docs advanced echo "::group::Completion" echo "tests complete" @@ -209,7 +217,8 @@ jobs: sysctl machdep.cpu.core_count echo "::endgroup::" - .ci/scripts/run-docs multimodal + # metadata does not install properly on macos + # .ci/scripts/run-docs multimodal echo "::group::Completion" echo "tests complete" @@ -243,7 +252,8 @@ jobs: sysctl machdep.cpu.core_count echo "::endgroup::" - .ci/scripts/run-docs native + echo ".ci/scripts/run-docs native DISABLED" + # .ci/scripts/run-docs native echo "::group::Completion" echo "tests complete" diff --git a/.github/workflows/run-readme-pr-mps.yml b/.github/workflows/run-readme-pr-mps.yml index db16bc80e..80b836e2b 100644 --- a/.github/workflows/run-readme-pr-mps.yml +++ b/.github/workflows/run-readme-pr-mps.yml @@ -81,7 +81,7 @@ jobs: sysctl machdep.cpu.core_count echo "::endgroup::" - .ci/scripts/run-docs gguf + # .ci/scripts/run-docs gguf echo "::group::Completion" echo "tests complete" @@ -162,7 +162,8 @@ jobs: sysctl machdep.cpu.core_count echo "::endgroup::" - .ci/scripts/run-docs 
multimodal + # metadata does not install properly on macos + # .ci/scripts/run-docs multimodal echo "::group::Completion" echo "tests complete" @@ -189,7 +190,8 @@ jobs: sysctl machdep.cpu.core_count echo "::endgroup::" - .ci/scripts/run-docs native + echo ".ci/scripts/run-docs native DISABLED" + # .ci/scripts/run-docs native echo "::group::Completion" echo "tests complete" diff --git a/.github/workflows/run-readme-pr.yml b/.github/workflows/run-readme-pr.yml index 37c27822b..99c098f66 100644 --- a/.github/workflows/run-readme-pr.yml +++ b/.github/workflows/run-readme-pr.yml @@ -1,4 +1,4 @@ -name: Run the README instructions - with stories +name: Run the README instructions - with stories and new template on: pull_request: @@ -9,322 +9,244 @@ on: jobs: test-readme-any: - permissions: - id-token: write - contents: read - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - with: - runner: linux.g5.4xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.4" - timeout: 60 - script: | - echo "::group::Print machine info" - uname -a - echo "::endgroup::" - - .ci/scripts/run-docs readme - - echo "::group::Completion" - echo "tests complete" - echo "*******************************************" - echo "::endgroup::" + strategy: + matrix: + runner: [linux.g5.4xlarge.nvidia.gpu] + runs-on: ${{matrix.runner}} + steps: + - uses: actions/checkout@v3 # Updated to v3 + - uses: actions/setup-python@v4 # Updated to v4 + - uses: maxim-lobanov/setup-xcode@v1 # This action is fine + if: runner.os == 'macOS' + with: + xcode-version: '15.3' # Or latest stable if possible + - run: | + uname -a + if [ $(uname -s) == Darwin ]; then + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + fi + - run: | + echo "Summary pip3 packages" + export TORCHCHAT_ROOT=$PWD + pip3 list + python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' + python3 -c 'import torchvision;print(f"torchvision: {torchvision.__version__, 
torchvision.version.git_version}")' + - run: | + echo "::group::Print machine info and try install pip and/or pip3" + set -x + which pip || true + which pip3 || true + which conda || true + uname -a + echo "::endgroup::" + + . .ci/scripts/run-docs readme + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" test-readme-cpu: - permissions: - id-token: write - contents: read - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - with: - runner: linux.g5.4xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.4" - timeout: 60 - script: | - echo "::group::Print machine info" - uname -a - echo "::endgroup::" - - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs readme - - echo "::group::Completion" - echo "tests complete" - echo "*******************************************" - echo "::endgroup::" + strategy: + matrix: + runner: [linux.g5.4xlarge.nvidia.gpu] + runs-on: ${{matrix.runner}} + steps: + - uses: actions/checkout@v3 # Updated to v3 + - uses: actions/setup-python@v4 # Updated to v4 + - uses: maxim-lobanov/setup-xcode@v1 # This action is fine + if: runner.os == 'macOS' + with: + xcode-version: '15.3' # Or latest stable if possible + - run: | + uname -a + if [ $(uname -s) == Darwin ]; then + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + fi + - run: | + echo "Summary pip3 packages" + export TORCHCHAT_ROOT=$PWD + pip3 list + python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' + python3 -c 'import torchvision;print(f"torchvision: {torchvision.__version__, torchvision.version.git_version}")' + - run: | + echo "::group::Print machine info and try install pip and/or pip3" + set -x + which pip || true + which pip3 || true + which conda || true + uname -a + echo "::endgroup::" + + export TORCHCHAT_DEVICE=cpu + . 
.ci/scripts/run-docs readme + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" test-quantization-any: - permissions: - id-token: write - contents: read - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - with: - runner: linux.g5.4xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.4" - timeout: 60 - script: | - echo "::group::Print machine info" - uname -a - echo "::endgroup::" - - .ci/scripts/run-docs quantization - - echo "::group::Completion" - echo "tests complete" - echo "*******************************************" - echo "::endgroup::" + strategy: + matrix: + runner: [linux.g5.4xlarge.nvidia.gpu] + runs-on: ${{matrix.runner}} + steps: + - uses: actions/checkout@v3 # Updated to v3 + - uses: actions/setup-python@v4 # Updated to v4 + - uses: maxim-lobanov/setup-xcode@v1 # This action is fine + if: runner.os == 'macOS' + with: + xcode-version: '15.3' # Or latest stable if possible + - run: | + uname -a + if [ $(uname -s) == Darwin ]; then + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + fi + - run: | + echo "Summary pip3 packages" + export TORCHCHAT_ROOT=$PWD + pip3 list + python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' + python3 -c 'import torchvision;print(f"torchvision: {torchvision.__version__, torchvision.version.git_version}")' + - run: | + echo "::group::Print machine info and try install pip and/or pip3" + set -x + which pip || true + which pip3 || true + which conda || true + uname -a + echo "::endgroup::" + + . 
.ci/scripts/run-docs quantization + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" test-quantization-cpu: - permissions: - id-token: write - contents: read - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - with: - runner: linux.g5.4xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.4" - timeout: 60 - script: | - echo "::group::Print machine info" - uname -a - echo "::endgroup::" - - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs quantization - - test-gguf-any: - permissions: - id-token: write - contents: read - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - with: - runner: linux.g5.4xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.4" - timeout: 60 - script: | - echo "::group::Print machine info" - uname -a - echo "::endgroup::" - - .ci/scripts/run-docs gguf - - echo "::group::Completion" - echo "tests complete" - echo "*******************************************" - echo "::endgroup::" - - test-gguf-cpu: - permissions: - id-token: write - contents: read - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - with: - runner: linux.g5.4xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.4" - timeout: 60 - script: | - echo "::group::Print machine info" - uname -a - echo "::endgroup::" - - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs gguf - - echo "::group::Completion" - echo "tests complete" - echo "*******************************************" - echo "::endgroup::" - + strategy: + matrix: + runner: [linux.g5.4xlarge.nvidia.gpu] + runs-on: ${{matrix.runner}} + steps: + - uses: actions/checkout@v3 # Updated to v3 + - uses: actions/setup-python@v4 # Updated to v4 + - uses: maxim-lobanov/setup-xcode@v1 # This action is fine + if: runner.os == 'macOS' + with: + xcode-version: '15.3' # Or latest stable if possible + - run: | + uname -a + if [ $(uname -s) == Darwin ]; then + sysctl machdep.cpu.brand_string + 
sysctl machdep.cpu.core_count + fi + - run: | + echo "Summary pip3 packages" + export TORCHCHAT_ROOT=$PWD + pip3 list + python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' + python3 -c 'import torchvision;print(f"torchvision: {torchvision.__version__, torchvision.version.git_version}")' + - run: | + echo "::group::Print machine info and try install pip and/or pip3" + set -x + which pip || true + which pip3 || true + which conda || true + uname -a + echo "::endgroup::" + + export TORCHCHAT_DEVICE=cpu + . .ci/scripts/run-docs quantization + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" test-advanced-any: - permissions: - id-token: write - contents: read - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - with: - runner: linux.g5.4xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.4" - timeout: 60 - script: | - echo "::group::Print machine info" - uname -a - echo "::endgroup::" - - .ci/scripts/run-docs advanced - - echo "::group::Completion" - echo "tests complete" - echo "*******************************************" - echo "::endgroup::" - + strategy: + matrix: + runner: [linux.g5.4xlarge.nvidia.gpu] + runs-on: ${{matrix.runner}} + steps: + - uses: actions/checkout@v3 # Updated to v3 + - uses: actions/setup-python@v4 # Updated to v4 + - uses: maxim-lobanov/setup-xcode@v1 # This action is fine + if: runner.os == 'macOS' + with: + xcode-version: '15.3' # Or latest stable if possible + - run: | + uname -a + if [ $(uname -s) == Darwin ]; then + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + fi + - run: | + echo "Summary pip3 packages" + export TORCHCHAT_ROOT=$PWD + pip3 list + python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' + python3 -c 'import torchvision;print(f"torchvision: {torchvision.__version__, torchvision.version.git_version}")' + - run: | + echo 
"::group::Print machine info and try install pip and/or pip3" + set -x + which pip || true + which pip3 || true + which conda || true + uname -a + echo "::endgroup::" + + . .ci/scripts/run-docs advanced + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" test-advanced-cpu: - permissions: - id-token: write - contents: read - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - with: - runner: linux.g5.4xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.4" - timeout: 60 - script: | - echo "::group::Print machine info" - uname -a - echo "::endgroup::" - - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs advanced - - echo "::group::Completion" - echo "tests complete" - echo "*******************************************" - echo "::endgroup::" - - test-evaluation-any: - permissions: - id-token: write - contents: read - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - with: - runner: linux.g5.4xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.4" - timeout: 60 - script: | - echo "::group::Print machine info" - uname -a - echo "::endgroup::" - - .ci/scripts/run-docs evaluation - - echo "::group::Completion" - echo "tests complete" - echo "*******************************************" - echo "::endgroup::" - - test-evaluation-cpu: - permissions: - id-token: write - contents: read - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - with: - runner: linux.g5.4xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.4" - timeout: 60 - script: | - echo "::group::Print machine info" - uname -a - echo "::endgroup::" - - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs evaluation - - echo "::group::Completion" - echo "tests complete" - echo "*******************************************" - echo "::endgroup::" - - test-multimodal-any: - permissions: - id-token: write - contents: read - uses: 
pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - with: - runner: linux.g5.4xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.4" - timeout: 60 - script: | - echo "::group::Print machine info" - uname -a - echo "::endgroup::" - - .ci/scripts/run-docs multimodal - - echo "::group::Completion" - echo "tests complete" - echo "*******************************************" - echo "::endgroup::" - - test-multimodal-cpu: - permissions: - id-token: write - contents: read - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - with: - runner: linux.g5.4xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.4" - timeout: 60 - script: | - echo "::group::Print machine info" - uname -a - echo "::endgroup::" - - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs multimodal - - test-native-any: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - with: - runner: linux.g5.4xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.1" - timeout: 60 - script: | - echo "::group::Print machine info" - uname -a - echo "::endgroup::" - - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - - .ci/scripts/run-docs native - - echo "::group::Completion" - echo "tests complete" - echo "*******************************************" - echo "::endgroup::" - - test-native-cpu: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - with: - runner: linux.g5.4xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.1" - timeout: 60 - script: | - echo "::group::Print machine info" - uname -a - echo "::endgroup::" - - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs native - - test-distributed-cuda: - 
permissions: - id-token: write - contents: read - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - with: - runner: linux.g5.4xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.4" - timeout: 60 - script: | - echo "::group::Print machine info" - uname -a - echo "::endgroup::" - - .ci/scripts/run-docs distributed - - echo "::group::Completion" - echo "tests complete" - echo "*******************************************" - echo "::endgroup::" + strategy: + matrix: + runner: [linux.g5.4xlarge.nvidia.gpu] + runs-on: ${{matrix.runner}} + steps: + - uses: actions/checkout@v3 # Updated to v3 + - uses: actions/setup-python@v4 # Updated to v4 + - uses: maxim-lobanov/setup-xcode@v1 # This action is fine + if: runner.os == 'macOS' + with: + xcode-version: '15.3' # Or latest stable if possible + - run: | + uname -a + if [ $(uname -s) == Darwin ]; then + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + fi + - run: | + echo "Summary pip3 packages" + export TORCHCHAT_ROOT=$PWD + pip3 list + python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' + python3 -c 'import torchvision;print(f"torchvision: {torchvision.__version__, torchvision.version.git_version}")' + - run: | + echo "::group::Print machine info and try install pip and/or pip3" + set -x + which pip || true + which pip3 || true + which conda || true + uname -a + echo "::endgroup::" + + export TORCHCHAT_DEVICE=cpu + . 
.ci/scripts/run-docs advanced + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" diff --git a/README.md b/README.md index 51db1bfca..a6cec65b1 100644 --- a/README.md +++ b/README.md @@ -95,10 +95,11 @@ cd torchchat python3 -m venv .venv source .venv/bin/activate ./install/install_requirements.sh +mkdir exportedModels ``` [skip default]: end -[shell default]: ./install/install_requirements.sh +[shell default]: mkdir exportedModels; ./install/install_requirements.sh ## Commands @@ -243,7 +244,9 @@ python3 torchchat.py server llama3.1 ``` [skip default]: end + In another terminal, query the server using `curl`. Depending on the model configuration, this query might take a few minutes to respond. @@ -284,7 +287,9 @@ curl http://127.0.0.1:5000/v1/chat/completions \ [skip default]: end + diff --git a/docs/ADVANCED-USERS.md b/docs/ADVANCED-USERS.md index 17958e790..9e006acf2 100644 --- a/docs/ADVANCED-USERS.md +++ b/docs/ADVANCED-USERS.md @@ -177,6 +177,8 @@ preparatory step: You can set these variables as follows for the exemplary model15M model from Andrej Karpathy's tinyllamas model family: +[shell default]: pip install wget + ``` MODEL_NAME=stories15M MODEL_DIR=~/checkpoints/${MODEL_NAME} @@ -185,6 +187,16 @@ MODEL_OUT=~/torchchat-exports mkdir -p ${MODEL_DIR} mkdir -p ${MODEL_OUT} + +# Change to the MODELDIR directory +pushd ${MODEL_DIR} + +# Download the files for stories15M using wget +wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt +wget https://github.com/karpathy/llama2.c/raw/refs/heads/master/tokenizer.model + +# Go back to the original directory +popd ``` When we export models with AOT Inductor for servers and desktops, and @@ -335,7 +347,7 @@ tests against the exported model with the same interface, and support additional experiments to confirm model quality and speed. 
``` -python3 torchchat.py generate --device [ cuda | cpu ] --dso-path ${MODEL_NAME}.so --prompt "Once upon a time" +python3 torchchat.py generate --device [ cuda | cpu ] --checkpoint-path ${MODEL_PATH} --dso-path ${MODEL_NAME}.so --prompt "Once upon a time" ``` diff --git a/docs/multimodal.md b/docs/multimodal.md index cd249a1fb..975cdbd25 100644 --- a/docs/multimodal.md +++ b/docs/multimodal.md @@ -111,3 +111,5 @@ One of the goals of torchchat is to support various execution modes for every mo - **[ExecuTorch](https://github.com/pytorch/executorch)**: On-device (Edge) inference In addition, we are in the process of integrating with [lm_evaluation_harness](https://github.com/EleutherAI/lm-evaluation-harness) for multimodal model evaluation. + +[end default]: end diff --git a/docs/native-execution.md b/docs/native-execution.md index c22d3c3ba..dc0c799b1 100644 --- a/docs/native-execution.md +++ b/docs/native-execution.md @@ -83,6 +83,7 @@ python3 torchchat.py export stories15M --output-dso-path ./model.so We can now execute the runner with: [shell default]: pip install wget + ``` curl -OL https://github.com/karpathy/llama2.c/raw/master/tokenizer.model ./cmake-out/aoti_run ./model.so -z ./tokenizer.model -l 2 -i "Once upon a time" @@ -109,7 +110,7 @@ installed ExecuTorch, running the commands below will build the runner, without re-installing ExecuTorch from source: ``` -# Pull submodules (re2, abseil) for Tiktoken +# Pull submodules re2 and abseil for Tiktoken git submodule sync git submodule update --init diff --git a/install/install_requirements.sh b/install/install_requirements.sh index 360ba1801..9736bb7d6 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -19,6 +19,7 @@ then fi fi echo "Using python executable: $PYTHON_EXECUTABLE" +echo "located at $(which $PYTHON_EXECUTABLE || echo not found)" PYTHON_SYS_VERSION="$($PYTHON_EXECUTABLE -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")" # Check python 
version. Expect at least 3.10.x @@ -43,6 +44,14 @@ else fi echo "Using pip executable: $PIP_EXECUTABLE" +echo "located at $(which $PIP_EXECUTABLE || echo not found)" + +echo +echo "possible pip candidates are:" +echo "pip is located at $(which pip || echo not found)" +echo "pip3 is located at $(which pip3 || echo not found)" +echo "pip${PYTHON_SYS_VERSION} is located at $(which pip${PYTHON_SYS_VERSION} || echo not found)" +echo # Since torchchat often uses main-branch features of pytorch, only the nightly # pip versions will have the required features. The PYTORCH_NIGHTLY_VERSION value should