 from data_juicer.ops.base_op import DEFAULT_BATCH_SIZE, TAGGING_OPS
 from data_juicer.utils.constant import Fields
 from data_juicer.utils.file_utils import is_remote_path
-from data_juicer.utils.resource_utils import cuda_device_count
 from data_juicer.utils.webdataset_utils import _custom_default_decoder


@@ -86,13 +85,6 @@ def preprocess_dataset(dataset: ray.data.Dataset, dataset_path, cfg) -> ray.data
     return dataset


-def get_num_gpus(op, op_proc):
-    if not op.use_cuda():
-        return 0
-    proc_per_gpu = op_proc / cuda_device_count()
-    return 1.0 / proc_per_gpu
-
-
 def filter_batch(batch, filter_func):
     mask = pyarrow.array(filter_func(batch.to_pydict()))
     return batch.filter(mask)
@@ -199,7 +191,20 @@ def process(self, operators, *, exporter=None, checkpointer=None, tracer=None) -
        cached_columns = set(columns_result)

        for op in operators:
-            cached_columns = self._run_single_op(op, cached_columns, tracer=tracer)
+            try:
+                cached_columns = self._run_single_op(op, cached_columns, tracer=tracer)
+            except Exception as e:
+                logger.error(f"Error processing operator {op}: {e}.")
+                if op.runtime_env is not None:
+                    logger.error("Try to fallback to the base runtime environment.")
+                    original_runtime_env = op.runtime_env
+                    try:
+                        op.runtime_env = None
+                        cached_columns = self._run_single_op(op, cached_columns, tracer=tracer)
+                    finally:
+                        op.runtime_env = original_runtime_env
+                else:
+                    raise e
        return self

    def _run_single_op(self, op, cached_columns=None, tracer=None):
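
For illustration, the change above wraps each operator in a try/except: if an operator fails while declaring a custom runtime_env, it is retried once in the base environment, and the original runtime_env is restored in a finally block. Below is a minimal standalone sketch of that pattern; the `Op` class and `run_op` helper are hypothetical stand-ins, not data-juicer APIs.

```python
# Sketch of the fallback-and-restore pattern added in process() above.
# `Op` and `run_op` are hypothetical stand-ins, not data-juicer APIs.
import logging

logger = logging.getLogger(__name__)


class Op:
    def __init__(self, runtime_env=None):
        # e.g. {"pip": ["some_extra_pkg"]}; None means the base environment
        self.runtime_env = runtime_env


def run_op(op, columns):
    # Stand-in for _run_single_op: fails whenever a custom runtime_env is set.
    if op.runtime_env is not None:
        raise RuntimeError("runtime_env failed to set up")
    return columns | {"new_col"}


def run_with_fallback(op, columns):
    try:
        return run_op(op, columns)
    except Exception as e:
        logger.error(f"Error processing operator {op}: {e}.")
        if op.runtime_env is None:
            raise
        # Retry once in the base environment; restore the declared
        # runtime_env afterwards so later code still sees it.
        original = op.runtime_env
        try:
            op.runtime_env = None
            return run_op(op, columns)
        finally:
            op.runtime_env = original


if __name__ == "__main__":
    op = Op(runtime_env={"pip": ["missing_pkg"]})
    print(run_with_fallback(op, {"text"}))  # retries without runtime_env
    print(op.runtime_env)                   # original value is restored
```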
|