Skip to content

Commit ae290f7

Browse files
authored
[feature] op-level isolated environment spec in ray mode (#892)
* + add basic OPEnvSpec and its test cases * + add OPEnvManager and its test cases * + add reporting of the current states of OPEnvManager * + integrate OPEnvManager into ray dataset + add placeholder test class for load_ops + remove unused functions * + add auto-analyzing and recommendation for OP env requirements according to the LazyLoader usage + add corresponding test cases * + add corresponding docs in dev docs * * modification according to gemini's comments * + add a debug log * + add a debug log * + add a debug log * - remove trouble cases * * refine according to dy's comments * * strip after getting the package name and url * * strip after getting the package name and url * * fix a test case
1 parent 22fea0c commit ae290f7

File tree

17 files changed

+1643
-18
lines changed

17 files changed

+1643
-18
lines changed

data_juicer/config/config.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -548,6 +548,28 @@ def init_configs(args: Optional[List[str]] = None, which_entry: object = None, l
548548
"meta will be involved. Only available when filter_list_to_mine "
549549
"is true.",
550550
)
551+
parser.add_argument(
552+
"--min_common_dep_num_to_combine",
553+
type=int,
554+
default=-1,
555+
help="The minimum number of common dependencies required to determine whether to merge two operation "
556+
"environment specifications. If set to -1, it means no combination of operation environments, where "
557+
"every OP has its own runtime environment during processing without any merging. If set to >= 0, "
558+
"environments of OPs that share at least min_common_dep_num_to_combine common dependencies will be "
559+
"merged. It will open the operator environment manager to automatically analyze and merge runtime "
560+
"environment for different OPs. It helps different OPs share and reuse the same runtime environment to "
561+
"reduce resource utilization. It's -1 in default. Only available in ray mode. ",
562+
)
563+
parser.add_argument(
564+
"--conflict_resolve_strategy",
565+
type=str,
566+
default="split",
567+
choices=["split", "overwrite", "latest"],
568+
help="Strategy for resolving dependency conflicts, default is 'split' strategy. 'split': Keep the two "
569+
"specs split when there is a conflict. 'overwrite': Overwrite the existing dependency with one "
570+
"from the later OP. 'latest': Use the latest version of all specified dependency versions. "
571+
"Only available when min_common_dep_num_to_combine >= 0.",
572+
)
551573
parser.add_argument(
552574
"--op_fusion",
553575
type=bool,

data_juicer/core/data/ray_dataset.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
from data_juicer.ops.base_op import DEFAULT_BATCH_SIZE, TAGGING_OPS
1818
from data_juicer.utils.constant import Fields
1919
from data_juicer.utils.file_utils import is_remote_path
20-
from data_juicer.utils.resource_utils import cuda_device_count
2120
from data_juicer.utils.webdataset_utils import _custom_default_decoder
2221

2322

@@ -86,13 +85,6 @@ def preprocess_dataset(dataset: ray.data.Dataset, dataset_path, cfg) -> ray.data
8685
return dataset
8786

8887

89-
def get_num_gpus(op, op_proc):
90-
if not op.use_cuda():
91-
return 0
92-
proc_per_gpu = op_proc / cuda_device_count()
93-
return 1.0 / proc_per_gpu
94-
95-
9688
def filter_batch(batch, filter_func):
9789
mask = pyarrow.array(filter_func(batch.to_pydict()))
9890
return batch.filter(mask)
@@ -199,7 +191,20 @@ def process(self, operators, *, exporter=None, checkpointer=None, tracer=None) -
199191
cached_columns = set(columns_result)
200192

201193
for op in operators:
202-
cached_columns = self._run_single_op(op, cached_columns, tracer=tracer)
194+
try:
195+
cached_columns = self._run_single_op(op, cached_columns, tracer=tracer)
196+
except Exception as e:
197+
logger.error(f"Error processing operator {op}: {e}.")
198+
if op.runtime_env is not None:
199+
logger.error("Try to fallback to the base runtime environment.")
200+
original_runtime_env = op.runtime_env
201+
try:
202+
op.runtime_env = None
203+
cached_columns = self._run_single_op(op, cached_columns, tracer=tracer)
204+
finally:
205+
op.runtime_env = original_runtime_env
206+
else:
207+
raise e
203208
return self
204209

205210
def _run_single_op(self, op, cached_columns=None, tracer=None):

data_juicer/core/executor/ray_executor.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from data_juicer.core.executor.event_logging_mixin import EventLoggingMixin
1414
from data_juicer.core.ray_exporter import RayExporter
1515
from data_juicer.core.tracer.ray_tracer import RayTracer
16-
from data_juicer.ops import load_ops
16+
from data_juicer.ops import OPEnvManager, load_ops
1717
from data_juicer.ops.op_fusion import fuse_operators
1818
from data_juicer.utils.lazy_loader import LazyLoader
1919

@@ -122,6 +122,15 @@ def __init__(self, cfg: Optional[Namespace] = None):
122122
trace_keys=self.cfg.trace_keys,
123123
)
124124

125+
# setup OPEnvManager
126+
self.op_env_manager = None
127+
if self.cfg.min_common_dep_num_to_combine >= 0:
128+
logger.info("Preparing OPEnvManager...")
129+
self.op_env_manager = OPEnvManager(
130+
min_common_dep_num_to_combine=self.cfg.min_common_dep_num_to_combine,
131+
conflict_resolve_strategy=self.cfg.conflict_resolve_strategy,
132+
)
133+
125134
def run(self, load_data_np: Optional[PositiveInt] = None, skip_export: bool = False, skip_return: bool = False):
126135
"""
127136
Running the dataset process pipeline
@@ -138,7 +147,7 @@ def run(self, load_data_np: Optional[PositiveInt] = None, skip_export: bool = Fa
138147

139148
# 2. extract processes
140149
logger.info("Preparing process operators...")
141-
ops = load_ops(self.cfg.process)
150+
ops = load_ops(self.cfg.process, self.op_env_manager)
142151

143152
# Initialize DAG execution planning (pass ops to avoid redundant loading)
144153
self._initialize_dag_execution(self.cfg, ops=ops)

data_juicer/ops/__init__.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,13 @@ def timing_context(description):
3030
Selector,
3131
)
3232
from .load import load_ops
33+
from .op_env import (
34+
OPEnvManager,
35+
OPEnvSpec,
36+
analyze_lazy_loaded_requirements,
37+
analyze_lazy_loaded_requirements_for_code_file,
38+
op_requirements_to_op_env_spec,
39+
)
3340

3441
__all__ = [
3542
'load_ops',
@@ -43,5 +50,10 @@ def timing_context(description):
4350
'NON_STATS_FILTERS',
4451
'OPERATORS',
4552
'TAGGING_OPS',
46-
'Pipeline'
53+
'Pipeline',
54+
'OPEnvSpec',
55+
'op_requirements_to_op_env_spec',
56+
'OPEnvManager',
57+
'analyze_lazy_loaded_requirements',
58+
'analyze_lazy_loaded_requirements_for_code_file',
4759
]

data_juicer/ops/base_op.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,12 @@
1313
from data_juicer.utils.registry import Registry
1414
from data_juicer.utils.resource_utils import is_cuda_available
1515

16+
from .op_env import (
17+
OPEnvSpec,
18+
analyze_lazy_loaded_requirements_for_code_file,
19+
op_requirements_to_op_env_spec,
20+
)
21+
1622
OPERATORS = Registry("Operators")
1723
UNFORKABLE = Registry("Unforkable")
1824
NON_STATS_FILTERS = Registry("Non-stats Filters")
@@ -281,9 +287,20 @@ def __call__(cls, *args, **kwargs):
281287

282288

283289
class OP(metaclass=OPMetaClass):
290+
# the name of this operator. Automatically set by the registry
291+
_name = ""
292+
293+
# the accelerator to run this operator. Either "cpu" or "cuda"
284294
_accelerator = "cpu"
295+
296+
# whether this operator is a batched operator
285297
_batched_op = False
286298

299+
# extra requirements for this operator. Should be:
300+
# 1. a list of packages
301+
# 2. a string of the path to the requirements.txt file
302+
_requirements = None
303+
287304
def __init__(self, *args, **kwargs):
288305
"""
289306
Base class of operators.
@@ -419,6 +436,12 @@ def __init__(self, *args, **kwargs):
419436
method = wrap_func_with_nested_access(method)
420437
setattr(self, name, method)
421438

439+
def get_env_spec(self) -> OPEnvSpec:
440+
import inspect
441+
442+
auto_analyzed_requirements = analyze_lazy_loaded_requirements_for_code_file(inspect.getfile(self.__class__))
443+
return op_requirements_to_op_env_spec(self._name, self._requirements, auto_analyzed_requirements)
444+
422445
def use_auto_proc(self):
423446
if is_ray_mode() and not self.use_ray_actor(): # ray task
424447
return self.num_proc == -1

data_juicer/ops/load.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
from .base_op import OPERATORS
22

33

4-
def load_ops(process_list):
4+
def load_ops(process_list, op_env_manager=None):
55
"""
66
Load op list according to the process list from config file.
77
88
:param process_list: A process list. Each item is an op name and its
99
arguments.
10+
:param op_env_manager: The OPEnvManager to try to merge environment specs of different OPs that have common
11+
dependencies. Only available when min_common_dep_num_to_combine >= 0.
1012
:return: The op instance list.
1113
"""
1214
ops = []
@@ -21,4 +23,20 @@ def load_ops(process_list):
2123
for op_cfg, op in zip(new_process_list, ops):
2224
op._op_cfg = op_cfg
2325

26+
# update op runtime environment if OPEnvManager is enabled
27+
if op_env_manager:
28+
# first round: record and merge possible common env specs
29+
for op in ops:
30+
op_name = op._name
31+
op_env_spec = op.get_env_spec()
32+
op_env_manager.record_op_env_spec(op_name, op_env_spec)
33+
# second round: update op runtime environment
34+
for op in ops:
35+
op_name = op._name
36+
op_env_spec = op_env_manager.get_op_env_spec(op_name)
37+
op._requirements = op_env_spec.pip_pkgs
38+
# if the runtime_env is not set for this OP, update the runtime_env as well
39+
if op.runtime_env is None:
40+
op.runtime_env = op_env_spec.to_dict()
41+
2442
return ops

data_juicer/ops/mapper/image_tagging_mapper.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import numpy as np
44

55
from data_juicer.utils.constant import Fields, MetaKeys
6+
from data_juicer.utils.lazy_loader import LazyLoader
67
from data_juicer.utils.mm_utils import load_data_with_context, load_image
78
from data_juicer.utils.model_utils import get_model, prepare_model, ram, torch
89

@@ -38,6 +39,7 @@ def __init__(self, tag_field_name: str = MetaKeys.image_tags, *args, **kwargs):
3839
"""
3940
kwargs["memory"] = "9GB" if kwargs.get("memory", 0) == 0 else kwargs["memory"]
4041
super().__init__(*args, **kwargs)
42+
LazyLoader.check_packages(["ram @ git+https://github.com/datajuicer/recognize-anything.git"])
4143
self.model_key = prepare_model(
4244
model_type="recognizeAnything", pretrained_model_name_or_path="ram_plus_swin_large_14m.pth", input_size=384
4345
)

data_juicer/ops/mapper/video_tagging_from_frames_mapper.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from pydantic import PositiveInt
55

66
from data_juicer.utils.constant import Fields, MetaKeys
7+
from data_juicer.utils.lazy_loader import LazyLoader
78
from data_juicer.utils.mm_utils import (
89
close_video,
910
extract_key_frames,
@@ -72,6 +73,7 @@ def __init__(
7273
"""
7374
kwargs["memory"] = "9GB" if kwargs.get("memory", 0) == 0 else kwargs["memory"]
7475
super().__init__(*args, **kwargs)
76+
LazyLoader.check_packages(["ram @ git+https://github.com/datajuicer/recognize-anything.git"])
7577
if frame_sampling_method not in ["all_keyframes", "uniform"]:
7678
raise ValueError(
7779
f"Frame sampling method [{frame_sampling_method}] is not "

0 commit comments

Comments (0)