from datasets import Dataset, Video, load_dataset

from fastvideo.configs.configs import DatasetType, PreprocessConfig
from fastvideo.logger import init_logger
from fastvideo.pipelines.pipeline_batch_info import PreprocessBatch

@@ -82,8 +78,6 @@ def __call__(self, batch: dict[str, Any]) -> bool:
8278
8379 def _validate_data_type (self , batch : dict [str , Any ]) -> bool :
8480 """Validate basic validity of data items"""
85- print ("-------------------------------" )
86- print (batch )
8781 return not (batch ["caption" ] is None or batch ["caption" ] == ""
8882 or "fps" not in batch or batch ["fps" ] is None or batch ["fps" ] <= 0
8983 or batch ["num_frames" ] is None or batch ["num_frames" ] <= 0 )
@@ -405,19 +399,10 @@ def _default_file_writer_fn(self, args_tuple: tuple) -> int:
        return written_count


def build_dataset(preprocess_config: PreprocessConfig, split: str) -> Dataset:
    if preprocess_config.dataset_type == DatasetType.HF:
        dataset = load_dataset(preprocess_config.dataset_path, split=split)
    elif preprocess_config.dataset_type == DatasetType.MERGED:
        metadata_json_path = os.path.join(preprocess_config.dataset_path,
                                          "videos2caption.json")
@@ -431,14 +416,6 @@ def build_dataset(preprocess_config: PreprocessConfig, split: str) -> Dataset:
            dataset = dataset.rename_column("cap", "caption")
        if "path" in column_names:
            dataset = dataset.rename_column("path", "name")

        # add video column
        def add_video_column(item: dict[str, Any]) -> dict[str, Any]:
            item["video"] = os.path.join(video_folder, item["name"])