@@ -286,6 +286,11 @@ def filter():
286286 # dataset.dump(save_path)
287287 result [recorder ] = True
288288 result = result .tolist ()
289+ save_path = cfg ['save_path' ]
290+ from bitarray import bitarray
291+ ba = bitarray (result )
292+ with open (save_path , 'wb' ) as f :
293+ ba .tofile (f )
289294 print (json .dumps ({"bool" : result }))
290295
291296def refine ():
@@ -299,7 +304,7 @@ def refine():
299304 if isinstance (cfg .yaml , str ):
300305 with open (cfg .yaml , 'r' ) as f :
301306 cfg .yaml = yaml .safe_load (f ) # 解析成字典
302-
307+
303308 for scorer_name , args in cfg .yaml .items ():
304309 if "num_workers" in cfg :
305310 args ["num_workers" ] = cfg .num_workers
@@ -329,7 +334,7 @@ def deduplicate():
329334 if isinstance (cfg .yaml , str ):
330335 with open (cfg .yaml , 'r' ) as f :
331336 cfg .yaml = yaml .safe_load (f ) # 解析成字典
332-
337+ result = []
333338 for scorer_name , args in cfg .yaml .items ():
334339 if "num_workers" in cfg :
335340 args ["num_workers" ] = cfg .num_workers
@@ -342,9 +347,11 @@ def deduplicate():
342347 dataset_dict [processor .data_type ] = datasets
343348 else :
344349 datasets = dataset_dict [processor .data_type ]
345- processed_dataset = processor (datasets )
346- dataset_dict [processor .data_type ] = processed_dataset
347- print (processed_dataset )
350+ result .append (processor (datasets ))
351+ # dataset_dict[processor.data_type] = processed_dataset
352+ save_path = cfg ['save_path' ]
353+ with open (save_path , 'w' ) as f :
354+ json .dump (result , f )
348355
349356
350357
0 commit comments