Fix/custom dataset chat template (meta-llama#665)

init27 · web-flow · commit ee1768dc6958 · 2024-09-25T05:21:54.000-07:00
diff --git a/recipes/quickstart/finetuning/datasets/custom_dataset.py b/recipes/quickstart/finetuning/datasets/custom_dataset.py
@@ -9,19 +9,30 @@
 
 
 B_INST, E_INST = "[INST]", "[/INST]"
+EOT_ID = 128009 #<|eot_id|>
+
+def mask_target(target,seq):
+    for i in range(len(seq)-len(target)):
+        if seq[i:i+len(target)] == target:
+            seq[i:i+len(target)] = [-100] * len(target)
+    return seq
 
 def tokenize_dialog(dialog, tokenizer):
     if tokenizer.vocab_size >= 128000:
         dialog_tokens = tokenizer.apply_chat_template(dialog)
-        dialog_tokens = dialog_tokens[:-4] # Remove generation prompt <|start_header_id|>assistant<|end_header_id|>\n\n
-        eot_indices = [i for i,n in enumerate(dialog_tokens) if n == 128009]
+        eot_indices = [i for i,n in enumerate(dialog_tokens) if n == EOT_ID]
         labels = copy.copy(dialog_tokens)
-        last_idx = 0
+        #determine token for system and user 
+        system_or_user = (tokenizer.encode("system")[-1], tokenizer.encode("user")[-1])
+        labels[0] = -100 # bos token
+        last_idx = 1
         for n, idx in enumerate(eot_indices):
-            if n % 2 == 1:
-                last_idx = idx
-            else:
+            role_token = labels[last_idx+1]
+            if role_token in system_or_user:
+                # Set labels to -100 for system and user tokens to ignore in loss function
                 labels[last_idx:idx+1] = [-100] * (idx-last_idx+1)
+            last_idx = idx + 1
+        mask_target(tokenizer.encode("<|start_header_id|>assistant<|end_header_id|>", add_special_tokens=False), labels)
 
         dialog_tokens = [dialog_tokens]
         labels_tokens = [labels]
diff --git a/src/tests/conftest.py b/src/tests/conftest.py
@@ -6,7 +6,7 @@
 from transformers import AutoTokenizer
 
 ACCESS_ERROR_MSG = "Could not access tokenizer at 'meta-llama/Llama-2-7b-hf'. Did you log into huggingface hub and provided the correct token?"
-LLAMA_VERSIONS = ["meta-llama/Llama-2-7b-hf", "meta-llama/Meta-Llama-3.1-8B"]
+LLAMA_VERSIONS = ["meta-llama/Llama-2-7b-hf", "meta-llama/Meta-Llama-3.1-8B-Instruct"]
 
 @pytest.fixture(params=LLAMA_VERSIONS)
 def llama_version(request):
diff --git a/src/tests/datasets/test_custom_dataset.py b/src/tests/datasets/test_custom_dataset.py
@@ -11,7 +11,7 @@
         "example_1": "[INST] Who made Berlin [/INST] dunno",
         "example_2": "[INST] Quiero preparar una pizza de pepperoni, puedes darme los pasos para hacerla? [/INST] Claro!",
     },
-    "meta-llama/Meta-Llama-3.1-8B":{
+    "meta-llama/Meta-Llama-3.1-8B-Instruct":{
         "example_1": "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nWho made Berlin<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\ndunno<|eot_id|><|end_of_text|>",
         "example_2": "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHow to start learning guitar and become a master at it?",
     },
@@ -114,3 +114,30 @@ def test_unknown_dataset_error(step_lr, optimizer, tokenizer, get_model, train,
         }
     with pytest.raises(AttributeError):
         main(**kwargs)
+
+@pytest.mark.skip_missing_tokenizer
+@patch('llama_recipes.finetuning.AutoTokenizer')
+def test_tokenize_dialog(tokenizer, monkeypatch, setup_tokenizer, llama_version):
+    monkeypatch.syspath_prepend("recipes/quickstart/finetuning/datasets/")
+    from custom_dataset import tokenize_dialog
+
+    setup_tokenizer(tokenizer)
+    tokenizer = tokenizer.from_pretrained()
+
+    dialog = [
+        {"role":"user", "content":"Who made Berlin?"},
+        {"role":"assistant", "content":"dunno"},
+        {"role":"user", "content":"And Rome?"},
+        {"role":"assistant", "content":"Romans"},
+    ]
+
+    result = tokenize_dialog(dialog, tokenizer)
+    
+    if "Llama-2" in llama_version:
+        assert result["labels"][:12] == [-100] * 12
+        assert result["labels"][17:28] == [-100] * 11
+        assert result["labels"].count(-100) == 11 + 12
+    else:
+        assert result["labels"][:38] == [-100] * 38
+        assert result["labels"][43:54] == [-100] * 11
+        assert result["labels"].count(-100) == 38 + 11