-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_test.jsonnet
More file actions
56 lines (49 loc) · 1.37 KB
/
data_test.jsonnet
File metadata and controls
56 lines (49 loc) · 1.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# Registry of supported models, keyed by the short name that is expected in
# the `model` external variable. Within the visible part of this file only
# `key` (used as the tokenizer's pretrained name/path) and `type` (forwarded
# to the data-loading steps as `model_type`) are read; the remaining fields
# are not referenced here.
local MODELS = {
  'llama2-7b': {
    type: 'decoder',
    key: 'meta-llama/Llama-2-7b-hf',
    revision: 'main',
    batch_size: 16,
    layers: [16],
    layer_template: 'model.layers.{}',
  },
};
local DATA = import 'data.libsonnet';
# Dataset split requested by every `load_data` step in this file.
local SPLIT = 'train';
# Short model name, supplied from outside through the `model` external
# variable.
local model_key = std.extVar('model');
# Configuration entry for the selected model. An unsupported name fails with
# an explicit message that lists the supported names, rather than with a bare
# missing-field error from the lookup.
local model_config =
  if std.objectHas(MODELS, model_key) then
    MODELS[model_key]
  else
    error (
      'Unknown model "%s"; supported models: %s' % [
        model_key,
        std.join(', ', std.objectFields(MODELS)),
      ]
    );
# Step that loads the tokenizer for the selected model. It is registered
# under the name "<model>-tokenizer", which is the name the other steps in
# this file use when they refer to the tokenizer.
local tokenizer_step = (
  local step_name = model_key + '-tokenizer';
  {
    [step_name]: {
      type: 'transformers::AutoTokenizer::from_pretrained::step',
      pretrained_model_name_or_path: model_config.key,
    },
  }
);
# One `load_data` step per entry of DATA, named after the entry's key. Each
# entry is expected to provide `name`, `config` and `prompt` fields, which
# are forwarded to the step together with the shared split, the model type
# and a reference to the tokenizer step.
local data_steps = {
  [dataset_id]: {
    local spec = DATA[dataset_id],
    type: 'load_data',
    dataset_name_or_path: spec.name,
    dataset_config_name: spec.config,
    prompt_template: spec.prompt,
    split: SPLIT,
    model_type: model_config.type,
    tokenizer: { ref: model_key + '-tokenizer' },
    find_prev_answer_tokens: false,
  }
  for dataset_id in std.objectFields(DATA)
};
# One `dump_data` step per entry of DATA, named "data_dump|<key>". Each step
# references the data step with the same key and the tokenizer step, and
# targets a per-dataset CSV path under local_outputs/data/.
local local_store = {
  ['data_dump|' + entry.key]: {
    type: 'dump_data',
    dataset: { ref: entry.key },
    tokenizer: { ref: model_key + '-tokenizer' },
    filepath: 'local_outputs/data/dump_' + entry.key + '.csv',
  }
  for entry in std.objectKeysValues(DATA)
};
# Final output: a single `steps` object that merges the tokenizer step, the
# per-dataset load steps and the per-dataset dump steps.
{
  steps: tokenizer_step + data_steps + local_store,
}