1414from  datasets  import  Dataset , Video , load_dataset 
1515
1616from  fastvideo .configs .configs  import  DatasetType , PreprocessConfig 
17- < << << <<  HEAD 
18- from  fastvideo .distributed .parallel_state  import  get_world_rank , get_world_size 
19- == == == = 
20- >> >> >> >  15 df36ab  ([Feat ][Preprocess ] support  merged  dataset  (#752)) 
2117from  fastvideo .logger  import  init_logger 
2218from  fastvideo .pipelines .pipeline_batch_info  import  PreprocessBatch 
2319
@@ -82,10 +78,8 @@ def __call__(self, batch: dict[str, Any]) -> bool:
8278
8379    def  _validate_data_type (self , batch : dict [str , Any ]) ->  bool :
8480        """Validate basic validity of data items""" 
85-         print ("-------------------------------" )
86-         print (batch )
87-         return  not  (batch ["caption" ] is  None  or  batch ["caption" ] ==  "" 
88-                     or  "fps"  not  in batch  or  batch ["fps" ] is  None  or  batch ["fps" ] <=  0 
81+         return  not  (batch ["caption" ] is  None  or  batch ["caption" ] ==  ""  or  "fps" 
82+                     not  in batch  or  batch ["fps" ] is  None  or  batch ["fps" ] <=  0 
8983                    or  batch ["num_frames" ] is  None  or  batch ["num_frames" ] <=  0 )
9084
9185    def  _validate_resolution (self , batch : dict [str , Any ]) ->  bool :
@@ -405,19 +399,9 @@ def _default_file_writer_fn(self, args_tuple: tuple) -> int:
405399        return  written_count 
406400
407401
408- < << << <<  HEAD 
409- def  build_dataset (preprocess_config : PreprocessConfig , split : str ,
410-                   validator : Callable [[dict [str , Any ]], bool ]) ->  Dataset :
411-     if  preprocess_config .dataset_type  ==  DatasetType .HF :
412-         dataset  =  load_dataset (preprocess_config .dataset_path , split = split )
413-         dataset  =  dataset .filter (validator )
414-         dataset  =  dataset .shard (num_shards = get_world_size (),
415-                                 index = get_world_rank ())
416- == == == = 
417402def  build_dataset (preprocess_config : PreprocessConfig , split : str ) ->  Dataset :
418403    if  preprocess_config .dataset_type  ==  DatasetType .HF :
419404        dataset  =  load_dataset (preprocess_config .dataset_path , split = split )
420- > >> >> >>  15 df36ab  ([Feat ][Preprocess ] support  merged  dataset  (#752)) 
421405    elif  preprocess_config .dataset_type  ==  DatasetType .MERGED :
422406        metadata_json_path  =  os .path .join (preprocess_config .dataset_path ,
423407                                          "videos2caption.json" )
@@ -431,14 +415,6 @@ def build_dataset(preprocess_config: PreprocessConfig, split: str) -> Dataset:
431415            dataset  =  dataset .rename_column ("cap" , "caption" )
432416        if  "path"  in  column_names :
433417            dataset  =  dataset .rename_column ("path" , "name" )
434- << << < <<  HEAD 
435- 
436-         dataset  =  dataset .filter (validator )
437-         dataset  =  dataset .shard (num_shards = get_world_size (),
438-                                 index = get_world_rank ())
439- 
440- == == == = 
441- >> > >> > >  15 df36ab  ([Feat ][Preprocess ] support  merged  dataset  (#752)) 
442418        # add video column 
443419        def  add_video_column (item : dict [str , Any ]) ->  dict [str , Any ]:
444420            item ["video" ] =  os .path .join (video_folder , item ["name" ])
0 commit comments