Skip to content

Commit 03ec74a

Browse files
authored
feat: Add tools and documents usage in chat template (#495)
* Add tools and documents usage in chat template
  Signed-off-by: Dushyant Behl <[email protected]>
* fix fmt and lint
  Signed-off-by: Dushyant Behl <[email protected]>
* Add unit test case for Granite3.1 chat template and dataset
  Signed-off-by: Dushyant Behl <[email protected]>

---------

Signed-off-by: Dushyant Behl <[email protected]>
1 parent cd4fdde commit 03ec74a

File tree

7 files changed

+155
-20
lines changed

7 files changed

+155
-20
lines changed

tests/artifacts/predefined_data_configs/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@
3737
DATA_CONFIG_MULTITURN_DATA_YAML = os.path.join(
3838
PREDEFINED_DATA_CONFIGS, "multi_turn_data_with_chat_template.yaml"
3939
)
40+
DATA_CONFIG_MULTITURN_GRANITE_3_1B_DATA_YAML = os.path.join(
41+
PREDEFINED_DATA_CONFIGS, "multi_turn_data_with_chat_template_granite_3_1B.yaml"
42+
)
4043
DATA_CONFIG_YAML_STREAMING_INPUT_OUTPUT = os.path.join(
4144
PREDEFINED_DATA_CONFIGS, "tokenize_and_apply_input_masking_streaming.yaml"
4245
)

tests/artifacts/predefined_data_configs/multi_turn_data_with_chat_template.yaml

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,5 +16,24 @@ datasets:
1616
data_handlers:
1717
- name: apply_tokenizer_chat_template
1818
arguments:
19+
remove_columns: all
1920
fn_kwargs:
20-
dataset_text_field: formatted_chat_data
21+
dataset_text_field: "formatted_chat_data"
22+
- name: dataset_2
23+
data_paths:
24+
- "FILE_PATH"
25+
data_handlers:
26+
- name: apply_tokenizer_chat_template
27+
arguments:
28+
remove_columns: all
29+
fn_kwargs:
30+
dataset_text_field: "formatted_chat_data"
31+
- name: dataset_3
32+
data_paths:
33+
- "FILE_PATH"
34+
data_handlers:
35+
- name: apply_tokenizer_chat_template
36+
arguments:
37+
remove_columns: all
38+
fn_kwargs:
39+
dataset_text_field: "formatted_chat_data"
tests/artifacts/predefined_data_configs/multi_turn_data_with_chat_template_granite_3_1B.yaml

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
dataprocessor:
2+
type: default
3+
chat_template: |
4+
{%- if messages[0]['role'] == 'system' %}
5+
{%- set system_message = messages[0]['content'] %}
6+
{%- set loop_messages = messages[1:] %}
7+
{%- else %}
8+
{%- set system_message = "Knowledge Cutoff Date: April 2024.\nToday's Date: " + strftime_now('%B %d, %Y') + ".\nYou are Granite, developed by IBM." %}
9+
{%- if tools and documents %}
10+
{%- set system_message = system_message + " You are a helpful AI assistant with access to the following tools. When a tool is required to answer the user's query, respond with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.\n\nWrite the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
11+
{%- elif tools %}
12+
{%- set system_message = system_message + " You are a helpful AI assistant with access to the following tools. When a tool is required to answer the user's query, respond with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request." %}
13+
{%- elif documents %}
14+
{%- set system_message = system_message + " Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data." %}
15+
{%- else %}
16+
{%- set system_message = system_message + " You are a helpful AI assistant." %}
17+
{%- endif %}
18+
{%- if 'citations' in controls and documents %}
19+
{%- set system_message = system_message + '\n\nIn your response, use the symbols <co> and </co> to indicate when a fact comes from a document in the search result, e.g <co>0</co> for a fact from document 0. Afterwards, list all the citations with their corresponding documents in an ordered list.' %}
20+
{%- endif %}
21+
{%- if 'hallucinations' in controls and documents %}
22+
{%- set system_message = system_message + '\n\nFinally, after the response is written, include a numbered list of sentences from the response that are potentially hallucinated and not based in the documents.' %}
23+
{%- endif %}
24+
{%- set loop_messages = messages %}
25+
{%- endif %}
26+
{{- '<|start_of_role|>system<|end_of_role|>' + system_message + '<|end_of_text|>\n' }}
27+
{%- if tools %}
28+
{{- '<|start_of_role|>tools<|end_of_role|>' }}
29+
{{- tools | tojson(indent=4) }}
30+
{{- '<|end_of_text|>\n' }}
31+
{%- endif %}
32+
{%- if documents %}
33+
{{- '<|start_of_role|>documents<|end_of_role|>' }}
34+
{%- for document in documents %}
35+
{{- 'Document ' + loop.index0 | string + '\n' }}
36+
{{- document['text'] }}
37+
{%- if not loop.last %}
38+
{{- '\n\n'}}
39+
{%- endif%}
40+
{%- endfor %}
41+
{{- '<|end_of_text|>\n' }}
42+
{%- endif %}
43+
{%- for message in loop_messages %}
44+
{{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}
45+
{%- if loop.last and add_generation_prompt %}
46+
{{- '<|start_of_role|>assistant' }}
47+
{%- if controls %}
48+
{{- ' ' + controls | tojson()}}
49+
{%- endif %}
50+
{{- '<|end_of_role|>' }}
51+
{%- endif %}
52+
{%- endfor %}
53+
datasets:
54+
- name: dataset_1
55+
data_paths:
56+
- "FILE_PATH"
57+
data_handlers:
58+
- name: apply_tokenizer_chat_template
59+
arguments:
60+
remove_columns: all
61+
fn_kwargs:
62+
dataset_text_field: "formatted_chat_data"
63+
conversation_column: "messages"
64+
- name: dataset_2
65+
data_paths:
66+
- "FILE_PATH"
67+
data_handlers:
68+
- name: apply_tokenizer_chat_template
69+
arguments:
70+
remove_columns: all
71+
fn_kwargs:
72+
dataset_text_field: "formatted_chat_data"
73+
conversation_column: "messages"
74+
- name: dataset_3
75+
data_paths:
76+
- "FILE_PATH"
77+
data_handlers:
78+
- name: apply_tokenizer_chat_template
79+
arguments:
80+
remove_columns: all
81+
fn_kwargs:
82+
dataset_text_field: "formatted_chat_data"
83+
conversation_column: "messages"

tests/artifacts/testdata/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@
6868
)
6969
CHAT_DATA_SINGLE_TURN = os.path.join(JSONL_DATA_DIR, "single_turn_chat.jsonl")
7070
CHAT_DATA_MULTI_TURN = os.path.join(JSONL_DATA_DIR, "multi_turn_chat.jsonl")
71+
CHAT_DATA_MULTI_TURN_GRANITE_3_1B = os.path.join(
72+
JSONL_DATA_DIR, "multi_turn_chat_granite_instruct.jsonl"
73+
)
7174
EMPTY_DATA = os.path.join(JSON_DATA_DIR, "empty_data.json")
7275
MALFORMATTED_DATA = os.path.join(JSON_DATA_DIR, "malformatted_data.json")
7376

tests/artifacts/testdata/jsonl/multi_turn_chat_granite_instruct.jsonl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{"messages": [{"content": "You are an AI language model developed by IBM Research. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior.", "role": "system"}, {"content": "Can you think of a word that rhymes with tree?", "role": "user"}, {"content": "Of course, I can help! Here's one word that rhymes with \"tree\":\n1\\. Three", "role": "assistant"}, {"content": "Can you think of a word that rhymes with tree?", "role": "user"}, {"content": "Of course, I can help! Here's one word that rhymes with \"tree\":\n1\\. Three", "role": "assistant"}], "group": "lab_extension", "dataset": "base/full-extension", "metadata": "{\"num_turns\": 2}"}
2+
{"messages": [{"content": "Can you think of a word that rhymes with tree?", "role": "user"}, {"content": "Of course, I can help! Here's one word that rhymes with \"tree\":\n1\\. Three", "role": "assistant"}, {"content": "Can you think of a word that rhymes with tree?", "role": "user"}, {"content": "Of course, I can help! Here's one word that rhymes with \"tree\":\n1\\. Three", "role": "assistant"}], "group": "lab_extension", "dataset": "base/full-extension", "metadata": "{\"num_turns\": 2}"}
3+
{"tools": [{"name": "calculate_tip", "arguments": {"bill_amount": 50, "tip_percentage": 20}}, {"name": "payment", "arguments": {"type": "card"}}], "messages": [{"content": "Tell me one word that rhymes and has the opposite meaning of \"open\".", "role": "user"}, {"content": "Sure! Here's a word that rhymes with \"open\" and has the opposite meaning:\n1\\. Shut", "role": "assistant"}, {"content": "Tell me one word that rhymes and has the opposite meaning of \"open\".", "role": "user"}, {"content": "Sure! Here's a word that rhymes with \"open\" and has the opposite meaning:\n1\\. Shut", "role": "assistant"}], "group": "lab_extension", "dataset": "base/full-extension", "metadata": "{\"num_turns\": 2}"}
4+
{"documents": [{"id": "abc", "title": "Title 1", "text": "This is a sample text of the first document"}, {"id": "def", "title": "Title 2", "text": "This is a sample text of the second document"}], "messages": [{"content": "For the word \"dream\", give an example of a word that rhymes with it and its synonym.", "role": "user"}, {"content": "Here's an example for \"dream\" that includes a word that rhymes with it and a synonym:\n1\\. Word that rhymes with \"dream\": \"beam\"\nSynonym: \"ideal\"", "role": "assistant"}, {"content": "For the word \"dream\", give an example of a word that rhymes with it and its synonym.", "role": "user"}, {"content": "Here's an example for \"dream\" that includes a word that rhymes with it and a synonym:\n1\\. Word that rhymes with \"dream\": \"beam\"\nSynonym: \"ideal\"", "role": "assistant"}], "group": "lab_extension", "dataset": "base/full-extension", "metadata": "{\"num_turns\": 2}"}
5+
{"tools": [{"name": "calculate_tip", "arguments": {"bill_amount": 50, "tip_percentage": 20}}, {"name": "payment", "arguments": {"type": "card"}}], "documents": [{"id": "abc", "title": "Title 1", "text": "This is a sample text of the first document"}, {"id": "def", "title": "Title 2", "text": "This is a sample text of the second document"}], "messages": [{"content": "Using the word \"grace\", come up with a word that rhymes and has the same number of syllables\n<nopace>", "role": "user"}, {"content": "Certainly! Here's a word that rhymes with \"grace\" and has the same number of syllables:\n1\\. Space", "role": "assistant"}, {"content": "Using the word \"grace\", come up with a word that rhymes and has the same number of syllables\n<nopace>", "role": "user"}, {"content": "Certainly! Here's a word that rhymes with \"grace\" and has the same number of syllables:\n1\\. Space", "role": "assistant"}], "group": "lab_extension", "dataset": "base/full-extension", "metadata": "{\"num_turns\": 2}"}
6+
{"controls": ["citations"], "documents": [{"id": "abc", "title": "Title 1", "text": "This is a sample text of the first document"}, {"id": "def", "title": "Title 2", "text": "This is a sample text of the second document"}], "messages": [{"content": "For the word \"dream\", give an example of a word that rhymes with it and its synonym.", "role": "user"}, {"content": "Here's an example for \"dream\" that includes a word that rhymes with it and a synonym:\n1\\. Word that rhymes with \"dream\": \"beam\"\nSynonym: \"ideal\"", "role": "assistant"}, {"content": "For the word \"dream\", give an example of a word that rhymes with it and its synonym.", "role": "user"}, {"content": "Here's an example for \"dream\" that includes a word that rhymes with it and a synonym:\n1\\. Word that rhymes with \"dream\": \"beam\"\nSynonym: \"ideal\"", "role": "assistant"}], "group": "lab_extension", "dataset": "base/full-extension", "metadata": "{\"num_turns\": 2}"}
7+
{"controls": ["hallucinations"], "documents": [{"id": "abc", "title": "Title 1", "text": "This is a sample text of the first document"}, {"id": "def", "title": "Title 2", "text": "This is a sample text of the second document"}], "messages": [{"content": "For the word \"dream\", give an example of a word that rhymes with it and its synonym.", "role": "user"}, {"content": "Here's an example for \"dream\" that includes a word that rhymes with it and a synonym:\n1\\. Word that rhymes with \"dream\": \"beam\"\nSynonym: \"ideal\"", "role": "assistant"}, {"content": "For the word \"dream\", give an example of a word that rhymes with it and its synonym.", "role": "user"}, {"content": "Here's an example for \"dream\" that includes a word that rhymes with it and a synonym:\n1\\. Word that rhymes with \"dream\": \"beam\"\nSynonym: \"ideal\"", "role": "assistant"}], "group": "lab_extension", "dataset": "base/full-extension", "metadata": "{\"num_turns\": 2}"}
8+
{"controls": ["citations", "hallucinations"], "documents": [{"id": "abc", "title": "Title 1", "text": "This is a sample text of the first document"}, {"id": "def", "title": "Title 2", "text": "This is a sample text of the second document"}], "messages": [{"content": "For the word \"dream\", give an example of a word that rhymes with it and its synonym.", "role": "user"}, {"content": "Here's an example for \"dream\" that includes a word that rhymes with it and a synonym:\n1\\. Word that rhymes with \"dream\": \"beam\"\nSynonym: \"ideal\"", "role": "assistant"}, {"content": "For the word \"dream\", give an example of a word that rhymes with it and its synonym.", "role": "user"}, {"content": "Here's an example for \"dream\" that includes a word that rhymes with it and a synonym:\n1\\. Word that rhymes with \"dream\": \"beam\"\nSynonym: \"ideal\"", "role": "assistant"}], "group": "lab_extension", "dataset": "base/full-extension", "metadata": "{\"num_turns\": 2}"}

tests/test_sft_trainer.py

Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
DATA_CONFIG_DUPLICATE_COLUMNS,
4141
DATA_CONFIG_MULTIPLE_DATASETS_SAMPLING_YAML,
4242
DATA_CONFIG_MULTITURN_DATA_YAML,
43+
DATA_CONFIG_MULTITURN_GRANITE_3_1B_DATA_YAML,
4344
DATA_CONFIG_RENAME_RETAIN_COLUMNS,
4445
DATA_CONFIG_SKIP_LARGE_TEXT_HANDLER,
4546
DATA_CONFIG_TOKENIZE_AND_APPLY_INPUT_MASKING_YAML,
@@ -49,6 +50,7 @@
4950
)
5051
from tests.artifacts.testdata import (
5152
CHAT_DATA_MULTI_TURN,
53+
CHAT_DATA_MULTI_TURN_GRANITE_3_1B,
5254
CHAT_DATA_SINGLE_TURN,
5355
CUSTOM_TOKENIZER_TINYLLAMA,
5456
EMPTY_DATA,
@@ -1244,7 +1246,15 @@ def test_run_chat_style_ft_using_dataconfig(datafiles, dataconfigfile):
12441246
(
12451247
[CHAT_DATA_SINGLE_TURN, CHAT_DATA_MULTI_TURN, CHAT_DATA_SINGLE_TURN],
12461248
DATA_CONFIG_MULTITURN_DATA_YAML,
1247-
)
1249+
),
1250+
(
1251+
[
1252+
CHAT_DATA_MULTI_TURN_GRANITE_3_1B,
1253+
CHAT_DATA_MULTI_TURN_GRANITE_3_1B,
1254+
CHAT_DATA_MULTI_TURN_GRANITE_3_1B,
1255+
],
1256+
DATA_CONFIG_MULTITURN_GRANITE_3_1B_DATA_YAML,
1257+
),
12481258
],
12491259
)
12501260
def test_run_chat_style_ft_using_dataconfig_for_chat_template(
@@ -1255,20 +1265,18 @@ def test_run_chat_style_ft_using_dataconfig_for_chat_template(
12551265
with tempfile.TemporaryDirectory() as tempdir:
12561266

12571267
data_args = copy.deepcopy(DATA_ARGS)
1258-
data_args.response_template = "<|assistant|>"
1259-
data_args.instruction_template = "<|user|>"
1260-
data_args.dataset_text_field = "new_formatted_field"
1261-
1262-
handler_kwargs = {"dataset_text_field": data_args.dataset_text_field}
1263-
kwargs = {
1264-
"fn_kwargs": handler_kwargs,
1265-
"batched": False,
1266-
"remove_columns": "all",
1267-
}
1268-
1269-
handler_config = DataHandlerConfig(
1270-
name="apply_tokenizer_chat_template", arguments=kwargs
1271-
)
1268+
if dataconfigfile == DATA_CONFIG_MULTITURN_GRANITE_3_1B_DATA_YAML:
1269+
data_args.response_template = "<|start_of_role|>assistant<|end_of_role|>"
1270+
data_args.instruction_template = "<|start_of_role|>user<|end_of_role|>"
1271+
data_args.add_special_tokens = [
1272+
"<|start_of_role|>assistant<|end_of_role|>",
1273+
"<|start_of_role|>user<|end_of_role|>",
1274+
]
1275+
elif dataconfigfile == DATA_CONFIG_MULTITURN_DATA_YAML:
1276+
data_args.response_template = "<|assistant|>"
1277+
data_args.instruction_template = "<|user|>"
1278+
1279+
data_args.dataset_text_field = "formatted_chat_data"
12721280

12731281
model_args = copy.deepcopy(MODEL_ARGS)
12741282
model_args.tokenizer_name_or_path = CUSTOM_TOKENIZER_TINYLLAMA
@@ -1284,8 +1292,6 @@ def test_run_chat_style_ft_using_dataconfig_for_chat_template(
12841292
datasets = data["datasets"]
12851293
for i, d in enumerate(datasets):
12861294
d["data_paths"] = [datafiles[i]]
1287-
# Basic chat datasets don't need data handling
1288-
d["data_handlers"] = [asdict(handler_config)]
12891295
yaml.dump(data, temp_yaml_file)
12901296
data_args.data_config_path = temp_yaml_file.name
12911297

tuning/data/data_handlers.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,14 +260,17 @@ def apply_tokenizer_chat_template(
260260
element: Dict[str, str],
261261
tokenizer: AutoTokenizer,
262262
dataset_text_field: str,
263+
conversation_column: str = None,
263264
**kwargs,
264265
):
265266
"""Function (data handler) to apply tokenizers chat template to dataset elements.
267+
Does not tokenize the dataset.
266268
Expects to be run as a HF Map API function.
267269
Args:
268270
element: the HF Dataset element.
269271
tokenizer: Tokenizer to be used.
270-
dataset_text_field: formatted_dataset_field.
272+
dataset_text_field: the field in which to store the rendered text.
273+
conversation_column: column name where the chat template expects the conversation
271274
Returns:
272275
Formatted HF Dataset element by formatting dataset with tokenizer's chat template
273276
Saves the result to dataset_text_field argument.
@@ -277,8 +280,18 @@ def apply_tokenizer_chat_template(
277280
"Tokenizer does not contain tokenizer.chat_template\
278281
please pass data_args.chat_template"
279282
)
283+
if conversation_column:
284+
converation = element[conversation_column]
285+
else:
286+
converation = element
287+
288+
tools = element["tools"] if "tools" in element else None
289+
documents = element["documents"] if "documents" in element else None
290+
280291
return {
281-
f"{dataset_text_field}": tokenizer.apply_chat_template(element, tokenize=False)
292+
f"{dataset_text_field}": tokenizer.apply_chat_template(
293+
converation, tools=tools, documents=documents, tokenize=False
294+
)
282295
}
283296

284297

0 commit comments

Comments
 (0)