
Commit ee35cd4

Merge branch 'main' into newtemplate

2 parents 23f3c49 + 083fdaf

File tree: 6 files changed, +105 −21 lines

.github/workflows/more-tests.yml

Lines changed: 67 additions & 2 deletions
@@ -19,6 +19,7 @@ jobs:
       gpu-arch-version: "12.4"
       timeout: 60
       script: |
+        set -xeou pipefail
         echo "::group::Print machine info"
         uname -a
         echo "::endgroup::"
@@ -39,9 +40,10 @@ jobs:
         echo "::endgroup::"

         echo "::group::Run inference"
-        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
+        export MODEL_DIR=checkpoints/stories15M/
+        export MODEL_PATH=${MODEL_DIR}/stories15M.pt
         export MODEL_NAME=stories15M
-        export MODEL_DIR=/tmp
+

         for DTYPE in bfloat16 float16 float32; do
           ###################################################################
@@ -83,3 +85,66 @@ jobs:
         echo "tests complete"
         echo "******************************************"
         echo "::endgroup::"
+
+
+  test-sdpa-backends-export:
+    permissions:
+      id-token: write
+      contents: read
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.4"
+      timeout: 60
+      script: |
+        set -xeou pipefail
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        echo "::group::Install requirements"
+        # Install requirements
+        ./install/install_requirements.sh cuda
+        pip3 list
+        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
+        echo "::endgroup::"
+
+        echo "::group::Download checkpoints"
+        mkdir -p checkpoints/stories15M
+        pushd checkpoints/stories15M
+        wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
+        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
+        popd
+        echo "::endgroup::"
+
+        echo "::group::Run inference"
+        export MODEL_DIR=checkpoints/stories15M/
+        export MODEL_PATH=${MODEL_DIR}/stories15M.pt
+        export MODEL_NAME=stories15M
+
+        ./torchchat/utils/scripts/build_native.sh aoti
+
+        for DEVICE in cpu cuda; do
+          # depending on how the parameter passing works, we may only be able to use bfloat16 for aoti_run, similar to runner-cuda-dtype.yml
+          # (although the runner environment should not have an opinion about what we use in the artifact, and we might suitably abstract that)
+          for DTYPE in bfloat16 float16 float32; do
+            for SDPA in 'math' 'flash_attention' 'efficient_attention' 'cudnn_attention'; do
+              echo "***************************************************************"
+              echo "*** $DEVICE $DTYPE $SDPA"
+              ###################################################################
+              # Export DSO and run with Python
+              python torchchat.py export --output-dso dso.so --checkpoint-path ${MODEL_PATH} --attention-backend ${SDPA} --device ${DEVICE} --dtype ${DTYPE}
+              python torchchat.py generate --dso-path dso.so --checkpoint-path ${MODEL_PATH} --attention-backend ${SDPA} --device ${DEVICE} --dtype ${DTYPE} --temperature 0 --prompt "Once upon a time"
+              ###################################################################
+              # Export AOTI and run with aoti_run
+              python torchchat.py export --output-aoti /tmp/model.pt2 --checkpoint-path ${MODEL_PATH} --attention-backend ${SDPA} --device ${DEVICE} --dtype ${DTYPE}
+              ./cmake-out/aoti_run /tmp/model.pt2 -z ${MODEL_DIR}/tokenizer.model -i "Once upon a time"
+              ###################################################################
+            done
+          done
+        done
+
+        echo "tests complete"
+        echo "******************************************"
+        echo "::endgroup::"
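
For reference, the new test-sdpa-backends-export job sweeps every SDPA backend across two devices and three dtypes. Below is a minimal sketch (not part of this commit) of what that sweep exercises at the PyTorch level, assuming a CUDA machine like the linux.g5.4xlarge.nvidia.gpu runner; the string-to-backend mapping mirrors the workflow's loop values:

import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

# Mirrors the workflow's 'math'/'flash_attention'/'efficient_attention'/
# 'cudnn_attention' loop values.
BACKENDS = {
    "math": SDPBackend.MATH,
    "flash_attention": SDPBackend.FLASH_ATTENTION,
    "efficient_attention": SDPBackend.EFFICIENT_ATTENTION,
    "cudnn_attention": SDPBackend.CUDNN_ATTENTION,
}

q = k = v = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.bfloat16)
for name, backend in BACKENDS.items():
    # Inside the context only the listed backend may be dispatched;
    # if it cannot handle the inputs, SDPA raises instead of falling back.
    with sdpa_kernel([backend]):
        out = F.scaled_dot_product_attention(q, k, v)
    print(name, tuple(out.shape))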

README.md

Lines changed: 6 additions & 1 deletion
@@ -3,7 +3,11 @@
 torchchat is a small codebase showcasing the ability to run large language models (LLMs) seamlessly. With torchchat, you can run LLMs using Python, within your own (C/C++) application (desktop or server) and on iOS and Android.

 > [!IMPORTANT]
-> Update September 25, 2024: torchchat has multimodal support for **Llama3.2 11B**!!
+> Update
+>
+> **February 3, 2025**: torchchat has support for [**DeepSeek R1 Distill: 8B**](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B)!
+>
+> **September 25, 2024**: torchchat has multimodal support for **Llama3.2 11B**!
 >
 > To try it out, finish the [Installation](#Installation) section below, then hop
 > over to our [multimodal guide](docs/multimodal.md) to learn more.
@@ -75,6 +79,7 @@ aliases.
 | [ibm-granite/granite-3.0-8b-instruct](https://huggingface.co/ibm-granite/granite-3.0-8b-instruct) || Alias to `granite3-8b`.|
 | [ibm-granite/granite-3.1-2b-instruct](https://huggingface.co/ibm-granite/granite-3.1-2b-instruct) || Alias to `granite3.1-2b` and `granite3.1`.|
 | [ibm-granite/granite-3.1-8b-instruct](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct) || Alias to `granite3.1-8b`.|
+| [deepseek-ai/DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) || Alias to `deepseek-r1:8b`.|


 ## Installation
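
The new table row pairs with the models.json entry added further down in this commit. A hypothetical lookup helper (torchchat's real resolution logic lives in its model-config code, not shown here) illustrating how an alias such as deepseek-r1:8b resolves to a HuggingFace distribution path:

import json

with open("torchchat/model_config/models.json") as f:
    configs = json.load(f)

def resolve(name: str) -> dict:
    # Exact key match first, then search each entry's alias list.
    if name in configs:
        return configs[name]
    for cfg in configs.values():
        if name in cfg.get("aliases", []):
            return cfg
    raise KeyError(f"unknown model or alias: {name}")

print(resolve("deepseek-r1:8b")["distribution_path"])
# deepseek-ai/DeepSeek-R1-Distill-Llama-8B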

tokenizer/hf_tokenizer.py

Lines changed: 8 additions & 2 deletions
@@ -46,8 +46,14 @@ def __init__(self, file_path: str):
         if tokenizer_config_path is not None:
             with open(tokenizer_config_path, "r") as handle:
                 tok_config = json.load(handle)
-            bos_token = tok_config.get("bos_token")
-            eos_token = tok_config.get("eos_token")
+
+            def _extract_token(identifier: str) -> Optional[str]:
+                entry: Optional[Union[str, dict]] = tok_config.get(identifier)
+                return entry.get("content") if isinstance(entry, dict) else entry
+
+            bos_token = _extract_token("bos_token")
+            eos_token = _extract_token("eos_token")
+
             if bos_token is not None:
                 self._bos_id = self._tokenizer.token_to_id(bos_token)
             if eos_token is not None:
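
Context for the change above: in newer Hugging Face tokenizer_config.json files, bos_token and eos_token may be AddedToken-style dicts rather than plain strings. A standalone sketch of the normalization the new closure performs (here taking tok_config as a parameter instead of closing over it):

import json
from typing import Optional, Union

def _extract_token(tok_config: dict, identifier: str) -> Optional[str]:
    # Accept both the legacy plain-string form and the dict form,
    # returning the token text or None if the key is absent.
    entry: Optional[Union[str, dict]] = tok_config.get(identifier)
    return entry.get("content") if isinstance(entry, dict) else entry

legacy = json.loads('{"bos_token": "<s>", "eos_token": "</s>"}')
modern = json.loads('{"bos_token": {"content": "<|begin_of_text|>", "special": true}}')

assert _extract_token(legacy, "bos_token") == "<s>"
assert _extract_token(modern, "bos_token") == "<|begin_of_text|>"
assert _extract_token(modern, "eos_token") is None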

torchchat/export.py

Lines changed: 17 additions & 16 deletions
@@ -490,13 +490,14 @@ def main(args):
         print(
             "WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead."
         )
-        export_for_server(
-            model_to_dso,
-            builder_args.device,
-            output_dso_path,
-            builder_args.dynamic_shapes,
-            package=False,
-        )
+        with torch.nn.attention.sdpa_kernel([builder_args.attention_backend]):
+            export_for_server(
+                model_to_dso,
+                builder_args.device,
+                output_dso_path,
+                builder_args.dynamic_shapes,
+                package=False,
+            )

     if output_aoti_package_path:
         output_aoti_package_path = str(os.path.abspath(output_aoti_package_path))
@@ -512,14 +513,15 @@ def main(args):
         print(
             "Exporting model using AOT Inductor to " f"{output_aoti_package_path}."
         )
-        export_for_server(
-            model_to_aoti_package,
-            builder_args.device,
-            output_aoti_package_path,
-            builder_args.dynamic_shapes,
-            package=True,
-            metadata=metadata,
-        )
+        with torch.nn.attention.sdpa_kernel([builder_args.attention_backend]):
+            export_for_server(
+                model_to_aoti_package,
+                builder_args.device,
+                output_aoti_package_path,
+                builder_args.dynamic_shapes,
+                package=True,
+                metadata=metadata,
+            )

     if output_snapshot_path:
         output_snapshot_path = str(os.path.abspath(output_snapshot_path))
@@ -529,4 +531,3 @@ def main(args):
             builder_args.device,
             output_snapshot_path,
         )
-
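
A minimal repro (again not from this commit) of the pattern the change introduces: running export inside torch.nn.attention.sdpa_kernel so the backend restriction from --attention-backend is in effect while the model is traced and compiled. TinyAttn and the shapes are made up for illustration:

import torch
from torch.nn.attention import SDPBackend, sdpa_kernel

class TinyAttn(torch.nn.Module):
    def forward(self, q, k, v):
        return torch.nn.functional.scaled_dot_product_attention(q, k, v)

q = k = v = torch.randn(1, 4, 16, 8)
# Analogous to builder_args.attention_backend wrapping export_for_server.
with sdpa_kernel([SDPBackend.MATH]):
    ep = torch.export.export(TinyAttn(), (q, k, v))
print(ep.graph_module.code)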

torchchat/model_config/models.json

Lines changed: 6 additions & 0 deletions
@@ -51,6 +51,12 @@
     "distribution_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
     "transformer_params_key": "Meta-Llama-3.1-8B"
   },
+  "deepseek-ai/DeepSeek-R1-Distill-Llama-8B": {
+    "aliases": ["deepseek-r1:8b"],
+    "distribution_channel": "HuggingFaceSnapshot",
+    "distribution_path": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+    "tokenizer_file": "tokenizer.json"
+  },
   "meta-llama/Meta-Llama-3.1-70B-Instruct": {
     "aliases": ["llama3.1-70b"],
     "distribution_channel": "HuggingFaceSnapshot",
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+{"block_size": 131072, "dim": 4096, "ffn_dim_multiplier": 1.3, "multiple_of": 1024, "n_heads": 32, "n_local_heads": 8, "n_layers": 32, "rope_base": 500000.0, "vocab_size": 128256, "use_tiktoken": true, "use_hf_tokenizer": true, "norm_eps": 1e-05, "rope_scaling": {"factor": 8.0, "low_freq_factor": 1.0, "high_freq_factor": 4.0, "original_max_position_embeddings": 8192}}
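
The new parameter file carries Llama 3.1-style rope_scaling values (factor 8.0, low/high frequency factors 1.0/4.0, original context 8192), as expected for a Llama-3.1-8B distill. For reference, a sketch of the standard Llama 3.1 frequency-scaling rule these four numbers drive, assuming torchchat follows the reference implementation:

import math
import torch

def apply_rope_scaling(
    freqs: torch.Tensor,
    factor: float = 8.0,
    low_freq_factor: float = 1.0,
    high_freq_factor: float = 4.0,
    original_max_position_embeddings: int = 8192,
) -> torch.Tensor:
    # Long wavelengths are divided by `factor`, short ones are kept as-is,
    # and the band in between is smoothly interpolated.
    low_freq_wavelen = original_max_position_embeddings / low_freq_factor
    high_freq_wavelen = original_max_position_embeddings / high_freq_factor
    out = []
    for freq in freqs.tolist():
        wavelen = 2 * math.pi / freq
        if wavelen < high_freq_wavelen:
            out.append(freq)
        elif wavelen > low_freq_wavelen:
            out.append(freq / factor)
        else:
            smooth = (original_max_position_embeddings / wavelen - low_freq_factor) / (
                high_freq_factor - low_freq_factor
            )
            out.append((1 - smooth) * freq / factor + smooth * freq)
    return torch.tensor(out, dtype=freqs.dtype)

# Base frequencies from the file's values: rope_base 500000.0,
# head dim = dim / n_heads = 4096 / 32 = 128.
head_dim = 4096 // 32
freqs = 1.0 / (500000.0 ** (torch.arange(0, head_dim, 2).float() / head_dim))
print(apply_rope_scaling(freqs)[:4])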
