@@ -36,7 +36,7 @@ class InTokensTestCommon:
     expected_output = {
         "input_ids": [1, 29871, 30429, 1, 29871, 30429, 2, 1, 29871, 31427, 1, 29871, 31427, 2],
         "labels": [-100, -100, -100, 1, 29871, 30429, 2, -100, -100, -100, 1, 29871, 31427, 2],
-        "position_ids": [0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6],
+        "position_ids": np.array([0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6]),
         "attention_mask": np.array(
             [
                 [
@@ -57,25 +57,34 @@ class InTokensTestCommon:
                 ]
             ]
         ),
+        "position_ids_2d": [[0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6], [0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6]],
     }

-    def preprocess_fn(self, example, max_src_length=3, max_tgt_length=3):
+    def preprocess_fn(
+        self,
+        example,
+        max_src_length=3,
+        max_tgt_length=3,
+        return_position_ids=True,
+        position_ids_2d=False,
+        return_attention_mask=True,
+    ):
         inputs = example["sentence"][:2]
         model_inputs = self.tokenizer(inputs, max_length=max_src_length, truncation=True, return_attention_mask=False)
         labels_input_ids = model_inputs["input_ids"] + [self.tokenizer.eos_token_id]
         model_inputs["labels"] = [-100] * len(model_inputs["input_ids"]) + labels_input_ids
         model_inputs["input_ids"] = model_inputs["input_ids"] + labels_input_ids
         seq_length = len(model_inputs["input_ids"])
-        model_inputs["position_ids"] = list(range(seq_length))
-        model_inputs["attention_mask"] = np.tril(np.ones([seq_length, seq_length]))
-        return model_inputs
-
-    def preprocess_fn_input_labels_only(self, example, max_src_length=3, max_tgt_length=3):
-        inputs = example["sentence"][:2]
-        model_inputs = self.tokenizer(inputs, max_length=max_src_length, truncation=True, return_attention_mask=False)
-        labels_input_ids = model_inputs["input_ids"] + [self.tokenizer.eos_token_id]
-        model_inputs["labels"] = [-100] * len(model_inputs["input_ids"]) + labels_input_ids
-        model_inputs["input_ids"] = model_inputs["input_ids"] + labels_input_ids
+        if return_position_ids:
+            if position_ids_2d:
+                position_ids = np.arange(seq_length, dtype=np.int64)
+                # fake block_position_ids with wrong values but correct shape
+                block_position_ids = np.arange(seq_length, dtype=np.int64)
+                model_inputs["position_ids"] = np.stack([position_ids, block_position_ids], axis=0)
+            else:
+                model_inputs["position_ids"] = list(range(seq_length))
+        if return_attention_mask:
+            model_inputs["attention_mask"] = np.tril(np.ones([seq_length, seq_length]))
         return model_inputs

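Note on the hunk above: `preprocess_fn` now absorbs the old `preprocess_fn_input_labels_only` via the `return_position_ids`/`return_attention_mask` flags, and optionally emits 2-D position ids. A minimal sketch of the label construction for a single example (token ids copied from `expected_output`; `src` and `eos` are stand-in names for the tokenizer output and `eos_token_id`):

```python
# Sketch of preprocess_fn's label/input_ids assembly for one example.
src = [1, 29871, 30429]           # prompt ids after truncation to max_src_length=3
eos = 2                           # self.tokenizer.eos_token_id
labels_input_ids = src + [eos]
labels = [-100] * len(src) + labels_input_ids   # prompt positions excluded from the loss
input_ids = src + labels_input_ids

assert input_ids == [1, 29871, 30429, 1, 29871, 30429, 2]
assert labels == [-100, -100, -100, 1, 29871, 30429, 2]
```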
@@ -89,10 +98,14 @@ def setUpClass(cls):
             data_files=[os.path.join(fixture_path, "tnews", "train.json")],
             lazy=False,
         )
-        copy_train_ids = copy.deepcopy(cls.train_ds)
+        copy_dataset_1 = copy.deepcopy(cls.train_ds)
+        copy_dataset_2 = copy.deepcopy(cls.train_ds)
         cls.dataset = cls.train_ds.map(lambda example: cls.preprocess_fn(cls, example))
-        cls.dataset_input_labels_only = copy_train_ids.map(
-            lambda example: cls.preprocess_fn_input_labels_only(cls, example)
+        cls.dataset_position_2d = copy_dataset_1.map(
+            lambda example: cls.preprocess_fn(cls, example, position_ids_2d=True)
+        )
+        cls.dataset_input_labels_only = copy_dataset_2.map(
+            lambda example: cls.preprocess_fn(cls, example, return_position_ids=False, return_attention_mask=False)
         )

     def test_long_max_length(self):
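The two deepcopies exist presumably because the dataset's `map` rewrites records in place, so each derived dataset needs its own copy or the first `map` would leak into the next. A minimal sketch of the hazard under that assumption (`map_inplace` is a hypothetical stand-in for such a `map`):

```python
import copy

def map_inplace(dataset, fn):
    # stand-in for a Dataset.map that rewrites records in place
    for record in dataset:
        record.update(fn(record))
    return dataset

base = [{"sentence": "abc"}]
first = map_inplace(copy.deepcopy(base), lambda r: {"position_ids": [0, 1, 2]})
second = map_inplace(copy.deepcopy(base), lambda r: {})
# Without the deepcopies, both results would share the same mutated records.
assert "position_ids" in first[0] and "position_ids" not in second[0]
```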
@@ -111,8 +124,8 @@ def test_long_max_length(self):
     def test_short_max_length(self):
         inData = InTokensMapDataset(self.dataset, self.tokenizer, max_length=16)
         self.assertEqual(inData[0]["input_ids"], self.expected_output["input_ids"])
-        self.assertEqual(inData[0]["position_ids"], self.expected_output["position_ids"])
         self.assertEqual(inData[0]["labels"], self.expected_output["labels"])
+        self.assertTrue((inData[0]["position_ids"] == self.expected_output["position_ids"]).all())
         self.assertTrue((inData[0]["attention_mask"] == self.expected_output["attention_mask"]).all())

         inData_input_labels_only = InTokensMapDataset(self.dataset_input_labels_only, self.tokenizer, max_length=16)
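The swap from `assertEqual` to `assertTrue(... .all())` follows from `expected_output["position_ids"]` becoming an `np.array`: `==` on arrays is elementwise, so the comparison must be reduced explicitly. A small illustration:

```python
import numpy as np

a = np.array([0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6])
b = [0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6]
# `a == b` is an elementwise boolean array; using it as a condition raises
# "The truth value of an array with more than one element is ambiguous",
# which is what assertEqual would hit. Reduce with .all() instead.
assert (a == b).all()
```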
@@ -122,6 +135,10 @@ def test_short_max_length(self):
             (inData_input_labels_only[0]["attention_mask"] == self.expected_output["attention_mask"]).all()
         )

+    def test_2d_position_id(self):
+        inData_2d = InTokensMapDataset(self.dataset_position_2d, self.tokenizer, max_length=16)
+        self.assertTrue((inData_2d[0]["position_ids"] == self.expected_output["position_ids_2d"]).all())
+
     def test_missing_data(self):
         orginal_input_ids = [item["input_ids"] for item in self.dataset]
         orginal_input_ids = [sum(orginal_input_ids, [])]
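`test_2d_position_id` checks that packing preserves the 2-D layout: each source example carries position ids of shape `(2, seq_length)` (the `np.stack` in `preprocess_fn`), and `expected_output["position_ids_2d"]` implies the packed result concatenates them along the last axis. A sketch of the shape arithmetic under that assumption:

```python
import numpy as np

seq_length = 7
position_ids = np.arange(seq_length, dtype=np.int64)
block_position_ids = np.arange(seq_length, dtype=np.int64)  # placeholder values, right shape
one_example = np.stack([position_ids, block_position_ids], axis=0)  # shape (2, 7)

# Packing two such examples side by side along the last axis:
packed = np.concatenate([one_example, one_example], axis=-1)
assert packed.shape == (2, 14)
assert (packed == np.array([[*range(7), *range(7)]] * 2)).all()  # matches position_ids_2d
```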
@@ -138,10 +155,14 @@ def setUpClass(cls):
         cls.train_ds = load_dataset(
             read_local_dataset, path=os.path.join(fixture_path, "tnews", "train.json"), lazy=True
         )
-        copy_train_ids = copy.deepcopy(cls.train_ds)
+        copy_dataset_1 = copy.deepcopy(cls.train_ds)
+        copy_dataset_2 = copy.deepcopy(cls.train_ds)
         cls.dataset = cls.train_ds.map(lambda example: cls.preprocess_fn(cls, example))
-        cls.dataset_input_labels_only = copy_train_ids.map(
-            lambda example: cls.preprocess_fn_input_labels_only(cls, example)
+        cls.dataset_position_2d = copy_dataset_1.map(
+            lambda example: cls.preprocess_fn(cls, example, position_ids_2d=True)
+        )
+        cls.dataset_input_labels_only = copy_dataset_2.map(
+            lambda example: cls.preprocess_fn(cls, example, return_position_ids=False, return_attention_mask=False)
         )

     def test_long_max_length(self):
@@ -174,8 +195,8 @@ def test_short_max_length(self):
             example.append(item)
             break
         self.assertEqual(example[0]["input_ids"], self.expected_output["input_ids"])
-        self.assertEqual(example[0]["position_ids"], self.expected_output["position_ids"])
         self.assertEqual(example[0]["labels"], self.expected_output["labels"])
+        self.assertTrue((example[0]["position_ids"] == self.expected_output["position_ids"]).all())
         self.assertTrue((example[0]["attention_mask"] == self.expected_output["attention_mask"]).all())

         inData_input_labels_only = InTokensIterableDataset(
@@ -189,6 +210,14 @@ def test_short_max_length(self):
         self.assertEqual(example[0]["labels"], self.expected_output["labels"])
         self.assertTrue((example[0]["attention_mask"] == self.expected_output["attention_mask"]).all())

+    def test_2d_position_id(self):
+        inData_2d = InTokensIterableDataset(self.dataset_position_2d, self.tokenizer, max_length=16)
+        example = []
+        for item in inData_2d:
+            example.append(item)
+            break
+        self.assertTrue((example[0]["position_ids"] == self.expected_output["position_ids_2d"]).all())
+
     def test_missing_data(self):
         orginal_input_ids = [item["input_ids"] for item in self.dataset]
         orginal_input_ids = [sum(orginal_input_ids, [])]
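Minor note on the iterable variant: the `for`/`break` loop only materializes the first packed example from the lazy dataset. A small helper would make that intent explicit (a sketch, not part of the diff):

```python
def first_item(iterable):
    """Materialize only the first element of a lazy iterable dataset."""
    return next(iter(iterable))
```

Usage would then be `example = first_item(inData_2d)` in place of the append/break loop.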