@@ -66,7 +66,7 @@ def __init__(self, meta: StorageConfig, config: BufferConfig):
6666 self .response_key = meta .format .response_key
6767 self .read_batch_size = config .read_batch_size
6868 self .dataset = _HFBatchReader (
69- load_dataset (meta .path , name = subset_name , split = self .split )
69+ load_dataset (meta .path , name = subset_name , split = self .split , trust_remote_code = True )
7070 ) # TODO: support resume
7171 self .data_iter = self .dataset .iter (self .read_batch_size , drop_last_batch = True )
7272 self .tokenizer = transformers .AutoTokenizer .from_pretrained (config .tokenizer_path )
@@ -143,7 +143,7 @@ def __init__(self, meta: StorageConfig, config: BufferConfig):
143143 self .rejected_key = meta .format .rejected_key
144144 self .read_batch_size = config .read_batch_size
145145 self .dataset = _HFBatchReader (
146- load_dataset (meta .path , name = subset_name , split = self .split )
146+ load_dataset (meta .path , name = subset_name , split = self .split , trust_remote_code = True )
147147 ) # TODO: support resume
148148 self .data_iter = self .dataset .iter (self .read_batch_size , drop_last_batch = True )
149149 self .tokenizer = transformers .AutoTokenizer .from_pretrained (config .tokenizer_path )
@@ -215,7 +215,7 @@ def __init__(self, meta: StorageConfig, config: BufferConfig):
215215 self .epoch = 0
216216 datasets .disable_caching ()
217217 self .dataset = _HFBatchReader (
218- load_dataset (meta .path , name = subset_name , split = self .split ),
218+ load_dataset (meta .path , name = subset_name , split = self .split , trust_remote_code = True ),
219219 max_epoch = self .meta .total_epochs if meta .task_type == TaskType .EXPLORE else 1 ,
220220 offset = self .meta .index ,
221221 )
@@ -266,7 +266,7 @@ def read(
266266class RawDataReader (BufferReader ):
267267 def __init__ (self , meta : StorageConfig , config : Optional [BufferConfig ]):
268268 self .returned = False
269- self .dataset = load_dataset (meta .path , name = meta .subset_name , split = meta .split )
269+ self .dataset = load_dataset (meta .path , name = meta .subset_name , split = meta .split , trust_remote_code = True )
270270
271271 def __len__ (self ):
272272 return len (self .dataset )
0 commit comments