Created tutorial on how to add our reasoning tokens

jamesbraza · jamesbraza · commit 3ba5b2eccc8f · 2025-06-04T16:05:00.000-07:00
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -30,8 +30,11 @@ repos:
       - id: debug-statements
       - id: detect-private-key
       - id: end-of-file-fixer
+        exclude_types: [jinja]
       - id: mixed-line-ending
+        exclude_types: [jinja]
       - id: trailing-whitespace
+        exclude_types: [jinja]
   - repo: https://github.com/pappasam/toml-sort
     rev: v0.24.2
     hooks:
diff --git a/docs/adding_tokens.ipynb b/docs/adding_tokens.ipynb
@@ -0,0 +1,129 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "072120f9",
+   "metadata": {},
+   "source": [
+    "If you would like to modify a base model to add our custom reasoning tokens,\n",
+    "here's how to do it.\n",
+    "\n",
+    "First, install the `add-tokens` extra (it provides the `transformers` package)\n",
+    "via `pip install 'ether0[add-tokens]'`; the quotes stop shells like zsh from globbing the brackets.\n",
+    "\n",
+    "Then, configure the following inputs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a2fb6296",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Model name/revisions for Hugging Face Hub\n",
+    "input_model_name = \"mistralai/Mistral-Small-24B-Instruct-2501\"\n",
+    "input_model_revision: str | None = None\n",
+    "output_model_name = \"FILL ME IN\"\n",
+    "output_model_revision: str | None = None\n",
+    "output_model_is_private = True\n",
+    "tokenizer_only = False  # Set True to only update the tokenizer\n",
+    "push_to_hf = False  # Set True to push to Hugging Face Hub\n",
+    "\n",
+    "# Chat template file that uses the new tokens\n",
+    "chat_template_path = \"updated_mistral_chat_template.jinja\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "99927d80",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8e15d3fb5e864e1286cf94fc588e504d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`\n",
+      "The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
+    "\n",
+    "from ether0.model_prompts import ANSWER_END, ANSWER_START, THINK_END, THINK_START\n",
+    "\n",
+    "REASONING_TOKENS_TO_ADD = [\n",
+    "    THINK_START,\n",
+    "    THINK_END,\n",
+    "    ANSWER_START,\n",
+    "    ANSWER_END,\n",
+    "]\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\n",
+    "    input_model_name, revision=input_model_revision\n",
+    ")\n",
+    "# NOTE: reasoning tokens are added as normal (not special) tokens so they\n",
+    "# aren't stripped when decoding with skip_special_tokens=True\n",
+    "tokenizer.add_tokens(REASONING_TOKENS_TO_ADD)\n",
+    "tokenizer.chat_template = Path(chat_template_path).read_text(encoding=\"utf-8\")\n",
+    "if push_to_hf:\n",
+    "    tokenizer.push_to_hub(\n",
+    "        output_model_name,\n",
+    "        revision=output_model_revision,\n",
+    "        private=output_model_is_private,\n",
+    "    )\n",
+    "\n",
+    "if not tokenizer_only:\n",
+    "    model = AutoModelForCausalLM.from_pretrained(\n",
+    "        input_model_name, revision=input_model_revision\n",
+    "    )\n",
+    "    # Pad vocab to a multiple of 64 for faster matmuls, SEE: https://www.thonking.ai/p/what-shapes-do-matrix-multiplications\n",
+    "    model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64)\n",
+    "    if push_to_hf:\n",
+    "        model.push_to_hub(\n",
+    "            output_model_name,\n",
+    "            revision=output_model_revision,\n",
+    "            private=output_model_is_private,\n",
+    "        )"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/updated_mistral_chat_template.jinja b/docs/updated_mistral_chat_template.jinja
@@ -0,0 +1,22 @@
+{%- set default_system_message = "You are a scientific reasoning AI assistant." %}
+{{- bos_token }}
+{%- if messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0]['content'] %}
+    {%- set loop_messages = messages[1:] %}
+{%- else %}
+    {%- set system_message = default_system_message %}
+    {%- set loop_messages = messages %}
+{%- endif %}
+{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}
+{#- Render the remaining turns in order; any role other than user, system, or assistant raises an exception. -#}
+{%- for message in loop_messages %}
+    {%- if message['role'] == 'user' %}
+        {{- '[INST]' + message['content'] + '[/INST]' }}
+    {%- elif message['role'] == 'system' %}
+        {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }}
+    {%- elif message['role'] == 'assistant' %}
+        {{- message['content'] + eos_token }}
+    {%- else %}
+        {{- raise_exception("Only user, system and assistant roles are supported!") }}
+    {%- endif %}
+{%- endfor %}
diff --git a/pyproject.toml b/pyproject.toml
@@ -44,8 +44,13 @@ readme = "README.md"
 requires-python = ">=3.11"
 
 [project.optional-dependencies]
+add-tokens = [
+    "ipykernel",  # For Jupyter notebook support
+    "ipywidgets>=8",  # For Jupyter notebook support, and pin to keep recent
+    "transformers>=4.49",  # Pin to keep recent
+]
 dev = [
-    "ether0[typing]",
+    "ether0[add-tokens,typing]",
     "huggingface-hub[cli]",  # For login inside of CI
     "ipython>=8",  # Pin to keep recent
     "mypy>=1.8",  # For addition of mutable-override
diff --git a/uv.lock b/uv.lock