1414from  datasets  import  Dataset , Video , load_dataset 
1515
1616from  fastvideo .configs .configs  import  DatasetType , PreprocessConfig 
1718from  fastvideo .distributed .parallel_state  import  get_world_rank , get_world_size 
1821from  fastvideo .logger  import  init_logger 
1922from  fastvideo .pipelines .pipeline_batch_info  import  PreprocessBatch 
2023
@@ -402,13 +405,19 @@ def _default_file_writer_fn(self, args_tuple: tuple) -> int:
402405        return  written_count 
403406
404407
408+ < << << <<  HEAD 
405409def  build_dataset (preprocess_config : PreprocessConfig , split : str ,
406410                  validator : Callable [[dict [str , Any ]], bool ]) ->  Dataset :
407411    if  preprocess_config .dataset_type  ==  DatasetType .HF :
408412        dataset  =  load_dataset (preprocess_config .dataset_path , split = split )
409413        dataset  =  dataset .filter (validator )
410414        dataset  =  dataset .shard (num_shards = get_world_size (),
411415                                index = get_world_rank ())
416+ == == == = 
417+ def  build_dataset (preprocess_config : PreprocessConfig , split : str ) ->  Dataset :
418+     if  preprocess_config .dataset_type  ==  DatasetType .HF :
419+         dataset  =  load_dataset (preprocess_config .dataset_path , split = split )
420+ > >> >> >>  15 df36ab  ([Feat ][Preprocess ] support  merged  dataset  (#752)) 
412421    elif  preprocess_config .dataset_type  ==  DatasetType .MERGED :
413422        metadata_json_path  =  os .path .join (preprocess_config .dataset_path ,
414423                                          "videos2caption.json" )
@@ -422,11 +431,14 @@ def build_dataset(preprocess_config: PreprocessConfig, split: str,
422431            dataset  =  dataset .rename_column ("cap" , "caption" )
423432        if  "path"  in  column_names :
424433            dataset  =  dataset .rename_column ("path" , "name" )
434+ << << < <<  HEAD 
425435
426436        dataset  =  dataset .filter (validator )
427437        dataset  =  dataset .shard (num_shards = get_world_size (),
428438                                index = get_world_rank ())
429439
440+ == == == = 
441+ >> > >> > >  15 df36ab  ([Feat ][Preprocess ] support  merged  dataset  (#752)) 
430442        # add video column 
431443        def  add_video_column (item : dict [str , Any ]) - >  dict [str , Any ]:
432444            item ["video" ] =  os .path .join (video_folder , item ["name" ])
0 commit comments