Add special tokens for models

rlouf · rlouf · commit 148c2741eb23 · 2024-09-27T11:36:00.000+02:00
We should download the relevant files from Hugging Face (HF). I don't think we can
avoid implementing the Jinja2 templates for each model family, though.
We would also need to match regular expressions instead of full model names (which might be slow).
diff --git a/prompts/tokens.py b/prompts/tokens.py
@@ -0,0 +1,29 @@
+from dataclasses import dataclass, field
+from typing import Dict, Optional
+
+
+@dataclass
+class Limits:  # pair of delimiter strings; both default to the empty string
+    begin: str = ""  # opening delimiter, e.g. "<s>" or "[INST]" in SPECIAL_TOKENS
+    end: str = ""  # closing delimiter, e.g. "</s>" or "[/INST]" in SPECIAL_TOKENS
+
+
+@dataclass
+class Special:
+    sequence: Limits = Limits("", "")
+    user: Limits = Limits("", "")
+    assistant: Limits = Limits("", "")
+    system: Limits = Limits("", "")
+
+
+SPECIAL_TOKENS: Dict[Optional[str], Special] = {
+    None: Special(),
+    "google/gemma-2-9b": Special(Limits("<bos>", "<eos>")),
+    "openai-community/gpt2": Special(Limits("", "<|endoftext|>")),
+    "mistralai/Mistral-7B-v0.1": Special(Limits("<s>", "</s>")),
+    "mistralai/Mistral-7B-Instruct-v0.1": Special(
+        Limits("<s>", "</s>"),
+        Limits("[INST]", "[/INST]"),
+        Limits("", "</s>"),
+    ),
+}