From 8ee137d87d8194047f939ae8f131b653e585ab5c Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Fri, 31 Jan 2025 16:04:37 -0800 Subject: [PATCH 1/3] Add DeepSeek R1 Distill 8B --- tokenizer/hf_tokenizer.py | 10 ++++++++-- torchchat/model_config/models.json | 6 ++++++ .../model_params/DeepSeek-R1-Distill-Llama-8B.json | 1 + 3 files changed, 15 insertions(+), 2 deletions(-) create mode 100644 torchchat/model_params/DeepSeek-R1-Distill-Llama-8B.json diff --git a/tokenizer/hf_tokenizer.py b/tokenizer/hf_tokenizer.py index d10ecb076..b77ee43ea 100644 --- a/tokenizer/hf_tokenizer.py +++ b/tokenizer/hf_tokenizer.py @@ -46,8 +46,14 @@ def __init__(self, file_path: str): if tokenizer_config_path is not None: with open(tokenizer_config_path, "r") as handle: tok_config = json.load(handle) - bos_token = tok_config.get("bos_token") - eos_token = tok_config.get("eos_token") + + def _extract_token(identifier: str) -> Optional[str]: + entry: Optional[Union[str, dict]] = tok_config.get(identifier) + return entry.get("content") if isinstance(entry, dict) else entry + + bos_token = _extract_token("bos_token") + eos_token = _extract_token("eos_token") + if bos_token is not None: self._bos_id = self._tokenizer.token_to_id(bos_token) if eos_token is not None: diff --git a/torchchat/model_config/models.json b/torchchat/model_config/models.json index d2252e6dd..6b8d9c334 100644 --- a/torchchat/model_config/models.json +++ b/torchchat/model_config/models.json @@ -51,6 +51,12 @@ "distribution_path": "meta-llama/Meta-Llama-3.1-8B-Instruct", "transformer_params_key": "Meta-Llama-3.1-8B" }, + "deepseek-ai/DeepSeek-R1-Distill-Llama-8B": { + "aliases": ["deepseek-r1-distill", "deepseek-r1-distill-8b"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "tokenizer_file": "tokenizer.json" + }, "meta-llama/Meta-Llama-3.1-70B-Instruct": { "aliases": ["llama3.1-70b"], "distribution_channel": "HuggingFaceSnapshot", diff --git 
a/torchchat/model_params/DeepSeek-R1-Distill-Llama-8B.json b/torchchat/model_params/DeepSeek-R1-Distill-Llama-8B.json new file mode 100644 index 000000000..b9fa79cd2 --- /dev/null +++ b/torchchat/model_params/DeepSeek-R1-Distill-Llama-8B.json @@ -0,0 +1 @@ +{"block_size": 131072, "dim": 4096, "ffn_dim_multiplier": 1.3, "multiple_of": 1024, "n_heads": 32, "n_local_heads": 8, "n_layers": 32, "rope_base": 500000.0, "vocab_size": 128256, "use_tiktoken": true, "use_hf_tokenizer": true, "norm_eps": 1e-05, "rope_scaling": {"factor": 8.0, "low_freq_factor": 1.0, "high_freq_factor": 4.0, "original_max_position_embeddings": 8192}} From ab3fa3c29c63a239836b85a2338fcef3c987d9a4 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Fri, 31 Jan 2025 16:24:47 -0800 Subject: [PATCH 2/3] Update aliases to match Ollama --- torchchat/model_config/models.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchchat/model_config/models.json b/torchchat/model_config/models.json index 6b8d9c334..3c2161b9b 100644 --- a/torchchat/model_config/models.json +++ b/torchchat/model_config/models.json @@ -52,7 +52,7 @@ "transformer_params_key": "Meta-Llama-3.1-8B" }, "deepseek-ai/DeepSeek-R1-Distill-Llama-8B": { - "aliases": ["deepseek-r1-distill", "deepseek-r1-distill-8b"], + "aliases": ["deepseek-r1:8b"], "distribution_channel": "HuggingFaceSnapshot", "distribution_path": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "tokenizer_file": "tokenizer.json" From ea4f8952d7c908087f2582faa330715c382d9d2b Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Mon, 3 Feb 2025 09:30:42 -0800 Subject: [PATCH 3/3] Update README --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 04fb4789e..51db1bfca 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,11 @@ torchchat is a small codebase showcasing the ability to run large language models (LLMs) seamlessly. 
With torchchat, you can run LLMs using Python, within your own (C/C++) application (desktop or server) and on iOS and Android. > [!IMPORTANT] -> Update September 25, 2024: torchchat has multimodal support for **Llama3.2 11B**!! +> Update +> +> **February 3, 2025**: torchchat has support for [**DeepSeek R1 Distill: 8B**](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B)! +> +> **September 25, 2024**: torchchat has multimodal support for **Llama3.2 11B**! > > To try it out, finish the [Installation](#Installation) section below, then hop > over to our [multimodal guide](docs/multimodal.md) to learn more. @@ -75,6 +79,7 @@ aliases. | [ibm-granite/granite-3.0-8b-instruct](https://huggingface.co/ibm-granite/granite-3.0-8b-instruct) |✅| Alias to `granite3-8b`.| | [ibm-granite/granite-3.1-2b-instruct](https://huggingface.co/ibm-granite/granite-3.1-2b-instruct) |✅| Alias to `granite3.1-2b` and `granite3.1`.| | [ibm-granite/granite-3.1-8b-instruct](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct) |✅| Alias to `granite3.1-8b`.| +| [deepseek-ai/DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) |✅| Alias to `deepseek-r1:8b`.| ## Installation