
Commit 0b1228f

Merge branch 'main' of github.com:meta-llama/llama-recipes into main
.github/scripts/spellcheck_conf/wordlist.txt

Changes to be committed:
  modified: .github/scripts/spellcheck_conf/wordlist.txt
  modified: README.md
  modified: pyproject.toml
  modified: recipes/quickstart/finetuning/datasets/custom_dataset.py
  modified: recipes/quickstart/inference/local_inference/inference.py
  modified: requirements.txt
  modified: src/tests/conftest.py
  modified: src/tests/datasets/test_custom_dataset.py
2 parents 460bfcc + ee1768d commit 0b1228f

8 files changed: +57 -13 lines changed


.github/scripts/spellcheck_conf/wordlist.txt

Lines changed: 1 addition & 0 deletions
@@ -1454,3 +1454,4 @@ acc
 OCRVQA
 OCRVQADataCollator
 ocrvqa
+langchain

README.md

Lines changed: 4 additions & 0 deletions
@@ -76,6 +76,10 @@ To use the sensitive topics safety checker install with:
 ```
 pip install llama-recipes[auditnlg]
 ```
+Some recipes require the presence of langchain. To install the packages follow the recipe description or install with:
+```
+pip install llama-recipes[langchain]
+```
 Optional dependencies can also be combines with [option1,option2].
 
 #### Install from source
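The README addition makes langchain an opt-in extra rather than a hard requirement. As a rough illustration (not part of the commit), a recipe could guard its imports the same way the inference script below guards gradio; the `require` helper and its arguments here are hypothetical:

```
import importlib.util

def require(package: str, extra: str) -> None:
    # Hypothetical helper: fail early with an install hint
    # if an optional dependency is missing.
    if importlib.util.find_spec(package) is None:
        raise ImportError(
            f"This recipe requires {package}. "
            f"Please run `pip install llama-recipes[{extra}]`."
        )

require("langchain", "langchain")
```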

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -24,6 +24,7 @@ dynamic = ["dependencies"]
 vllm = ["vllm"]
 tests = ["pytest-mock"]
 auditnlg = ["auditnlg"]
+langchain = ["langchain_openai", "langchain", "langchain_community"]
 
 [project.urls]
 "Homepage" = "https://github.com/facebookresearch/llama-recipes/"

recipes/quickstart/finetuning/datasets/custom_dataset.py

Lines changed: 17 additions & 6 deletions
@@ -9,19 +9,30 @@
 
 
 B_INST, E_INST = "[INST]", "[/INST]"
+EOT_ID = 128009 #<|eot_id|>
+
+def mask_target(target,seq):
+    for i in range(len(seq)-len(target)):
+        if seq[i:i+len(target)] == target:
+            seq[i:i+len(target)] = [-100] * len(target)
+    return seq
 
 def tokenize_dialog(dialog, tokenizer):
     if tokenizer.vocab_size >= 128000:
         dialog_tokens = tokenizer.apply_chat_template(dialog)
-        dialog_tokens = dialog_tokens[:-4] # Remove generation prompt <|start_header_id|>assistant<|end_header_id|>\n\n
-        eot_indices = [i for i,n in enumerate(dialog_tokens) if n == 128009]
+        eot_indices = [i for i,n in enumerate(dialog_tokens) if n == EOT_ID]
         labels = copy.copy(dialog_tokens)
-        last_idx = 0
+        #determine token for system and user
+        system_or_user = (tokenizer.encode("system")[-1], tokenizer.encode("user")[-1])
+        labels[0] = -100 # bos token
+        last_idx = 1
         for n, idx in enumerate(eot_indices):
-            if n % 2 == 1:
-                last_idx = idx
-            else:
+            role_token = labels[last_idx+1]
+            if role_token in system_or_user:
+                # Set labels to -100 for system and user tokens to ignore in loss function
                 labels[last_idx:idx+1] = [-100] * (idx-last_idx+1)
+            last_idx = idx + 1
+        mask_target(tokenizer.encode("<|start_header_id|>assistant<|end_header_id|>", add_special_tokens=False), labels)
 
         dialog_tokens = [dialog_tokens]
         labels_tokens = [labels]
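For context, the new `mask_target` helper scans the label list for every occurrence of a target token subsequence and overwrites it with -100, the index that PyTorch's cross-entropy loss ignores. A minimal standalone run (toy token ids, not real tokenizer output) behaves like this; note the `range` bound means a match flush with the end of the sequence is never checked:

```
# mask_target as added in this commit, exercised on toy token ids.
def mask_target(target, seq):
    for i in range(len(seq) - len(target)):  # a match ending exactly at len(seq) is skipped
        if seq[i:i + len(target)] == target:
            seq[i:i + len(target)] = [-100] * len(target)
    return seq

# Pretend [5, 6] is the tokenized "<|start_header_id|>assistant<|end_header_id|>" header.
labels = [1, 5, 6, 7, 8, 5, 6, 9]
print(mask_target([5, 6], labels))  # [1, -100, -100, 7, 8, -100, -100, 9]
```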

recipes/quickstart/inference/local_inference/inference.py

Lines changed: 5 additions & 1 deletion
@@ -6,7 +6,6 @@
 import time
 
 import fire
-import gradio as gr
 
 import torch
 
@@ -146,6 +145,11 @@ def inference(
         user_prompt = "\n".join(sys.stdin.readlines())
         inference(user_prompt, temperature, top_p, top_k, max_new_tokens)
     else:
+        try:
+            import gradio as gr
+        except ImportError:
+            raise ImportError("This part of the recipe requires gradio. Please run `pip install gradio`")
+
         gr.Interface(
             fn=inference,
             inputs=[
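Moving the gradio import inside the UI branch is a standard deferred-import pattern: the dependency is only resolved on the code path that needs it, so CLI use of the script no longer requires gradio to be installed. A minimal sketch of the same pattern (the function name is illustrative, not from the commit):

```
def launch_ui(inference_fn):
    # Import lazily so the module loads even when gradio is absent.
    try:
        import gradio as gr
    except ImportError as e:
        raise ImportError(
            "This part of the recipe requires gradio. Please run `pip install gradio`"
        ) from e
    return gr.Interface(fn=inference_fn, inputs="text", outputs="text")
```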

requirements.txt

Lines changed: 0 additions & 4 deletions
@@ -14,7 +14,6 @@ py7zr
 scipy
 optimum
 matplotlib
-gradio
 chardet
 openai
 typing-extensions==4.8.0
@@ -24,8 +23,5 @@ rouge_score
 pyyaml==6.0.1
 faiss-gpu; python_version < '3.11'
 unstructured[pdf]
-langchain_openai
-langchain
-langchain_community
 sentence_transformers
 codeshield

src/tests/conftest.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 from transformers import AutoTokenizer
 
 ACCESS_ERROR_MSG = "Could not access tokenizer at 'meta-llama/Llama-2-7b-hf'. Did you log into huggingface hub and provided the correct token?"
-LLAMA_VERSIONS = ["meta-llama/Llama-2-7b-hf", "meta-llama/Meta-Llama-3.1-8B"]
+LLAMA_VERSIONS = ["meta-llama/Llama-2-7b-hf", "meta-llama/Meta-Llama-3.1-8B-Instruct"]
 
 @pytest.fixture(params=LLAMA_VERSIONS)
 def llama_version(request):
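The conftest change swaps the base Llama 3.1 checkpoint for its instruct variant in the parametrized fixture. For readers unfamiliar with the pattern: every test that accepts `llama_version` runs once per entry in `LLAMA_VERSIONS`. A self-contained sketch (the `test_version_prefix` test is illustrative, not from the repo):

```
import pytest

LLAMA_VERSIONS = ["meta-llama/Llama-2-7b-hf", "meta-llama/Meta-Llama-3.1-8B-Instruct"]

@pytest.fixture(params=LLAMA_VERSIONS)
def llama_version(request):
    return request.param

def test_version_prefix(llama_version):
    # Runs twice, once per model id in LLAMA_VERSIONS.
    assert llama_version.startswith("meta-llama/")
```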

src/tests/datasets/test_custom_dataset.py

Lines changed: 28 additions & 1 deletion
@@ -11,7 +11,7 @@
         "example_1": "[INST] Who made Berlin [/INST] dunno",
         "example_2": "[INST] Quiero preparar una pizza de pepperoni, puedes darme los pasos para hacerla? [/INST] Claro!",
     },
-    "meta-llama/Meta-Llama-3.1-8B":{
+    "meta-llama/Meta-Llama-3.1-8B-Instruct":{
         "example_1": "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nWho made Berlin<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\ndunno<|eot_id|><|end_of_text|>",
         "example_2": "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHow to start learning guitar and become a master at it?",
     },
@@ -114,3 +114,30 @@ def test_unknown_dataset_error(step_lr, optimizer, tokenizer, get_model, train,
     }
     with pytest.raises(AttributeError):
         main(**kwargs)
+
+@pytest.mark.skip_missing_tokenizer
+@patch('llama_recipes.finetuning.AutoTokenizer')
+def test_tokenize_dialog(tokenizer, monkeypatch, setup_tokenizer, llama_version):
+    monkeypatch.syspath_prepend("recipes/quickstart/finetuning/datasets/")
+    from custom_dataset import tokenize_dialog
+
+    setup_tokenizer(tokenizer)
+    tokenizer = tokenizer.from_pretrained()
+
+    dialog = [
+        {"role":"user", "content":"Who made Berlin?"},
+        {"role":"assistant", "content":"dunno"},
+        {"role":"user", "content":"And Rome?"},
+        {"role":"assistant", "content":"Romans"},
+    ]
+
+    result = tokenize_dialog(dialog, tokenizer)
+
+    if "Llama-2" in llama_version:
+        assert result["labels"][:12] == [-100] * 12
+        assert result["labels"][17:28] == [-100] * 11
+        assert result["labels"].count(-100) == 11 + 12
+    else:
+        assert result["labels"][:38] == [-100] * 38
+        assert result["labels"][43:54] == [-100] * 11
+        assert result["labels"].count(-100) == 38 + 11
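The new test's assertions encode the masking scheme directly: for Llama 2, the 12 tokens of the first user prompt and the 11 tokens of the second are masked to -100, while the assistant answers keep their ids. A toy reconstruction with made-up token ids shows the arithmetic the assertions check (only the -100 layout mirrors the test; the lengths and ids are fabricated):

```
# Fabricated token ids; only the -100 layout mirrors the Llama 2 expectations.
labels = [-100] * 12 + [201, 202, 203, 204, 205] + [-100] * 11 + [206, 207]

assert labels[:12] == [-100] * 12     # first user turn fully masked
assert labels[17:28] == [-100] * 11   # second user turn fully masked
assert labels.count(-100) == 11 + 12  # nothing else is masked
```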
