Release v1.5.0 (#918)

HYLcool · cyruszhang · gemini-code-assist[bot] · web-flow · commit 2e62d2a8d8fb · 2026-02-26T13:02:08.000+08:00
* * update version and doc

* * ignore the first two items in the shape and focus on the specific shapes

* + add model_params to text_tagging_by_prompt_mapper
+ flush the buffer when outputting the trace results and wait for 1 sec

* bugfix: use /workspace for shared access in ray-head and ray-worker

* Update data_juicer/ops/mapper/text_tagging_by_prompt_mapper.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

---------

Co-authored-by: cyruszhang <cyrus.ylzhang@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
diff --git a/.gitignore b/.gitignore
@@ -32,6 +32,9 @@ tests/tools/tmp_*/
 tests/ops/deduplicator/chinese_dedup/
 tests/ops/deduplicator/english_dedup/
 
+# temp directory for distributed Ray tests (shared between containers)
+tmp/
+
 
 # perf bench data
 perf_bench_data/
diff --git a/README.md b/README.md
@@ -86,6 +86,15 @@ for s in res_ds:
 
 ## 📰 News
 
+<details open>
+<summary>[2026-02-12] Release v1.5.0: <b>Partitioned Ray Executor, OP-level Env Management, and More Embodied-AI OPs</b></summary>
+
+- 🚀 *Enhanced Distributed Execution Framework* -- Introduced partitioned Ray executor and OP-level isolated environments to improve fault tolerance, scalability, and dependency conflict resolution.
+- 🤖 *Expanded Embodied AI Video Processing* -- Added specialized operators for camera calibration, video undistortion, hand reconstruction, and pose estimation to strengthen multi-view video handling.
+- 💪🏻 *System Performance & Developer Experience Optimizations* -- Enabled batch inference, memory/log reduction, core logic refactoring, and updated documentation/templates.
+- 🐳 *Critical Bug Fixes & Stability Improvements* -- Resolved duplicate tracking, parameter conflicts, homepage rendering issues, and outdated docs for higher reliability.
+</details>
+
 <details open>
 <summary>[2026-02-02] Release v1.4.6: <b>Copilot, Video Bytes I/O & Ray Tracing </b></summary>
 
@@ -96,7 +105,7 @@ for s in res_ds:
 - 🐳 *Enhancements & fixes* — refreshed Docker image, small perf boosts, GitHub Insights traffic workflow, Ray compatibility updates, and bug/doc fixes.
 </details>
 
-<details open>
+<details>
 <summary>[2026-01-15] Release v1.4.5: <b>20+ New OPs, Ray vLLM Pipelines & Sphinx Docs Upgrade</b> </summary>
 
 - *Embodied-AI OPs*: added/enhanced mappers for video captioning (VLM), video object segmentation (YOLOE+SAM2), video depth estimation (viz + point cloud), human pose (MMPose), image tagging (VLM), single-image 3D body mesh recovery (SAM 3D Body), plus *S3 upload/download*.
diff --git a/README_ZH.md b/README_ZH.md
@@ -85,6 +85,15 @@ for s in res_ds:
 
 ## 📰 动态
 
+<details open>
+<summary>[2026-02-12] Release v1.5.0: <b>分区Ray执行器，OP级环境隔离，以及更多具身算子</b></summary>
+
+- 🚀 *分布式执行框架升级* — 新增分区Ray执行器与OP级隔离环境，强化容错性、可扩展性及依赖冲突管理。
+- 🤖 *具身AI视频处理能力扩展* — 集成相机校准、视频去畸变、手部重建、位姿估计等专用操作符，提升多视角视频处理能力。
+- 💪🏻 *系统性能与开发体验优化* — 支持批处理推理、内存/日志精简、关键逻辑重构，并更新文档与问题模板。
+- 🐳 *关键问题修复与稳定性提升* — 修复重复项追踪、参数冲突、首页渲染等缺陷，增强系统可靠性。
+</details>
+
 <details open>
 <summary>[2026-02-02] Release v1.4.6: <b>Copilot、视频字节 I/O 与 Ray 追踪</b></summary>
 
@@ -95,7 +104,7 @@ for s in res_ds:
 - 🐳 *增强与修复* — 刷新 Docker 镜像、小幅性能提升、GitHub Insights 流量工作流、Ray 兼容性更新以及 Bug/文档修复。
 </details>
 
-<details open>
+<details >
 <summary>[2026-01-15] Release v1.4.5: <b>20+ 新 OP、Ray vLLM 管道与 Sphinx 文档升级</b> </summary>
 
 - *具身 AI OP*：添加/增强了用于视频标题生成（VLM）、视频对象分割（YOLOE+SAM2）、视频深度估计（可视化 + 点云）、人体姿态（MMPose）、图像标签（VLM）、单图像 3D 人体网格恢复（SAM 3D Body）的映射器，以及 *S3 上传/下载*。
diff --git a/data_juicer/__init__.py b/data_juicer/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "1.4.6"
+__version__ = "1.5.0"
 
 import sys
 
diff --git a/data_juicer/core/tracer/ray_tracer.py b/data_juicer/core/tracer/ray_tracer.py
@@ -167,5 +167,7 @@ def finalize_traces(self):
             # We'll use a generic name for now, could be improved with operator type detection
             res_name = self.get_trace_file_path(op_name)
             dif_df = pd.DataFrame(traces)
-            dif_df.to_json(res_name, orient="records", lines=True, force_ascii=False)
+            with open(res_name, "w") as out_buf:
+                dif_df.to_json(out_buf, orient="records", lines=True, force_ascii=False)
+                out_buf.flush()
             print(f"Exported {len(traces)} traced samples for op [{op_name}] to {res_name}")
diff --git a/data_juicer/ops/mapper/text_tagging_by_prompt_mapper.py b/data_juicer/ops/mapper/text_tagging_by_prompt_mapper.py
@@ -74,6 +74,7 @@ def __init__(
         tensor_parallel_size: int = None,
         max_model_len: int = None,
         max_num_seqs: int = 256,
+        model_params: Dict = None,
         sampling_params: Dict = None,
         *args,
         **kwargs,
@@ -93,6 +94,7 @@ def __init__(
             derived from the model config.
         :param max_num_seqs: It is only valid when enable_vllm is True.
             Maximum number of sequences to be processed in a single iteration.
+        :param model_params: Parameters for model initialization.
         :param sampling_params: Sampling parameters for text generation.
             e.g {'temperature': 0.9, 'top_p': 0.95}
         :param args: extra args
@@ -117,7 +119,8 @@ def __init__(
         self.prompt = prompt
         self.tag_list = tag_list
         self.enable_vllm = enable_vllm
-        model_params = {"trust_remote_code": trust_remote_code, "max_num_seqs": max_num_seqs}
+        model_params = (model_params or {}).copy()
+        model_params.update({"trust_remote_code": trust_remote_code, "max_num_seqs": max_num_seqs})
         if tensor_parallel_size is not None:
             model_params["tensor_parallel_size"] = tensor_parallel_size
         if max_model_len is not None:
diff --git a/tests/core/executor/test_partitioned_integration.py b/tests/core/executor/test_partitioned_integration.py
@@ -18,6 +18,7 @@
 import shutil
 import tempfile
 import unittest
+import uuid
 
 from data_juicer.config import init_configs
 from data_juicer.core.executor.ray_executor_partitioned import PartitionedRayExecutor
@@ -31,7 +32,12 @@ class PartitionedExecutorIntegrationTest(DataJuicerTestCaseBase):
 
     def setUp(self) -> None:
         super().setUp()
-        self.tmp_dir = tempfile.mkdtemp(prefix='test_partitioned_integration_')
+        # Use a shared directory under root_path instead of system /tmp
+        # This ensures the temp directory is accessible by all Ray workers
+        # in distributed mode (e.g., Docker containers sharing /workspace)
+        unique_name = f'test_partitioned_integration_{uuid.uuid4().hex[:8]}'
+        self.tmp_dir = os.path.join(self.root_path, 'tmp', unique_name)
+        os.makedirs(self.tmp_dir, exist_ok=True)
 
     def tearDown(self) -> None:
         super().tearDown()
@@ -458,7 +464,12 @@ class CheckpointResumeIntegrationTest(DataJuicerTestCaseBase):
 
     def setUp(self) -> None:
         super().setUp()
-        self.tmp_dir = tempfile.mkdtemp(prefix='test_ckpt_resume_')
+        # Use a shared directory under root_path instead of system /tmp
+        # This ensures the temp directory is accessible by all Ray workers
+        # in distributed mode (e.g., Docker containers sharing /workspace)
+        unique_name = f'test_ckpt_resume_{uuid.uuid4().hex[:8]}'
+        self.tmp_dir = os.path.join(self.root_path, 'tmp', unique_name)
+        os.makedirs(self.tmp_dir, exist_ok=True)
 
     def tearDown(self) -> None:
         super().tearDown()
diff --git a/tests/core/executor/test_ray_executor_partitioned.py b/tests/core/executor/test_ray_executor_partitioned.py
@@ -1,6 +1,9 @@
 import os
+import shutil
 import tempfile
 import unittest
+import uuid
+
 from data_juicer.core.executor.ray_executor_partitioned import PartitionedRayExecutor
 from data_juicer.config import init_configs
 from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, TEST_TAG
@@ -11,13 +14,16 @@ class PartitionedRayExecutorTest(DataJuicerTestCaseBase):
 
     def setUp(self) -> None:
         super().setUp()
-        # Create temporary directory
-        self.tmp_dir = tempfile.mkdtemp(prefix='test_ray_executor_partitioned_')
+        # Use a shared directory under root_path instead of system /tmp
+        # This ensures the temp directory is accessible by all Ray workers
+        # in distributed mode (e.g., Docker containers sharing /workspace)
+        unique_name = f'test_ray_executor_partitioned_{uuid.uuid4().hex[:8]}'
+        self.tmp_dir = os.path.join(self.root_path, 'tmp', unique_name)
+        os.makedirs(self.tmp_dir, exist_ok=True)
 
     def tearDown(self) -> None:
         super().tearDown()
         # Clean up temporary directory
-        import shutil
         if os.path.exists(self.tmp_dir):
             shutil.rmtree(self.tmp_dir)
 
@@ -537,11 +543,15 @@ class PartitionedRayExecutorEdgeCasesTest(DataJuicerTestCaseBase):
 
     def setUp(self) -> None:
         super().setUp()
-        self.tmp_dir = tempfile.mkdtemp(prefix='test_ray_executor_edge_')
+        # Use a shared directory under root_path instead of system /tmp
+        # This ensures the temp directory is accessible by all Ray workers
+        # in distributed mode (e.g., Docker containers sharing /workspace)
+        unique_name = f'test_ray_executor_edge_{uuid.uuid4().hex[:8]}'
+        self.tmp_dir = os.path.join(self.root_path, 'tmp', unique_name)
+        os.makedirs(self.tmp_dir, exist_ok=True)
 
     def tearDown(self) -> None:
         super().tearDown()
-        import shutil
         if os.path.exists(self.tmp_dir):
             shutil.rmtree(self.tmp_dir)
 
diff --git a/tests/core/tracer/test_ray_tracer.py b/tests/core/tracer/test_ray_tracer.py
@@ -2,6 +2,7 @@
 import unittest
 import tempfile
 import shutil
+import time
 import jsonlines as jl
 from data_juicer.core.tracer.ray_tracer import RayTracer
 from data_juicer.utils.unittest_utils import TEST_TAG
@@ -58,6 +59,7 @@ def test_collect_mapper_sample_basic(self):
         
         # Finalize traces to write to file
         ray.get(tracer.finalize_traces.remote())
+        time.sleep(1)
         
         trace_file_path = os.path.join(self.work_dir, 'trace', 'sample_trace-test_mapper.jsonl')
         self.assertTrue(os.path.exists(trace_file_path))
@@ -87,6 +89,7 @@ def test_collect_mapper_sample_no_change(self):
         
         # Finalize traces to write to file
         ray.get(tracer.finalize_traces.remote())
+        time.sleep(1)
         
         trace_file_path = os.path.join(self.work_dir, 'trace', 'sample_trace-test_mapper.jsonl')
         # File should not exist since no samples were collected
@@ -105,6 +108,7 @@ def test_collect_mapper_sample_with_trace_keys(self):
         
         # Finalize traces to write to file
         ray.get(tracer.finalize_traces.remote())
+        time.sleep(1)
         
         trace_file_path = os.path.join(self.work_dir, 'trace', 'sample_trace-test_mapper.jsonl')
         self.assertTrue(os.path.exists(trace_file_path))
@@ -135,6 +139,7 @@ def test_collect_mapper_sample_with_missing_trace_keys(self):
         
         # Finalize traces to write to file
         ray.get(tracer.finalize_traces.remote())
+        time.sleep(1)
         
         trace_file_path = os.path.join(self.work_dir, 'trace', 'sample_trace-test_mapper.jsonl')
         self.assertTrue(os.path.exists(trace_file_path))
@@ -166,6 +171,7 @@ def test_collect_mapper_sample_not_in_op_list(self):
         
         # Finalize traces to write to file
         ray.get(tracer.finalize_traces.remote())
+        time.sleep(1)
         
         trace_file_path = os.path.join(self.work_dir, 'trace', 'sample_trace-test_mapper.jsonl')
         self.assertFalse(os.path.exists(trace_file_path))
@@ -183,6 +189,7 @@ def test_collect_filter_sample_basic(self):
         
         # Finalize traces to write to file
         ray.get(tracer.finalize_traces.remote())
+        time.sleep(1)
         
         trace_file_path = os.path.join(self.work_dir, 'trace', 'sample_trace-test_filter.jsonl')
         self.assertTrue(os.path.exists(trace_file_path))
@@ -208,6 +215,7 @@ def test_collect_filter_sample_should_keep(self):
         
         # Finalize traces to write to file
         ray.get(tracer.finalize_traces.remote())
+        time.sleep(1)
         
         trace_file_path = os.path.join(self.work_dir, 'trace', 'sample_trace-test_filter.jsonl')
         self.assertFalse(os.path.exists(trace_file_path))
@@ -225,6 +233,7 @@ def test_collect_filter_sample_not_in_op_list(self):
         
         # Finalize traces to write to file
         ray.get(tracer.finalize_traces.remote())
+        time.sleep(1)
         
         trace_file_path = os.path.join(self.work_dir, 'trace', 'sample_trace-test_filter.jsonl')
         self.assertFalse(os.path.exists(trace_file_path))
@@ -254,6 +263,7 @@ def test_collect_mapper_sample_show_num_limit(self):
         
         # Finalize traces to write to file
         ray.get(tracer.finalize_traces.remote())
+        time.sleep(1)
         
         trace_file_path = os.path.join(self.work_dir, 'trace', 'sample_trace-limited_mapper.jsonl')
         self.assertTrue(os.path.exists(trace_file_path))
@@ -288,6 +298,7 @@ def test_collect_filter_sample_show_num_limit(self):
         
         # Finalize traces to write to file
         ray.get(tracer.finalize_traces.remote())
+        time.sleep(1)
         
         trace_file_path = os.path.join(self.work_dir, 'trace', 'sample_trace-limited_filter.jsonl')
         self.assertTrue(os.path.exists(trace_file_path))
@@ -327,6 +338,7 @@ def test_finalize_traces_empty(self):
         
         # Don't collect anything, just finalize
         ray.get(tracer.finalize_traces.remote())
+        time.sleep(1)
         
         # No trace files should exist
         trace_dir = os.path.join(self.work_dir, 'trace')
diff --git a/tests/ops/mapper/test_text_tagging_by_prompt_mapper.py b/tests/ops/mapper/test_text_tagging_by_prompt_mapper.py
@@ -52,7 +52,9 @@ def test_tagging_vllm(self):
             enable_vllm=True,
             max_model_len=1024,
             max_num_seqs=16,
-            sampling_params={'temperature': 0.1, 'top_p': 0.95, 'max_tokens': 256})
+            sampling_params={'temperature': 0.1, 'top_p': 0.95, 'max_tokens': 256},
+            model_params={'gpu_memory_utilization': 0.8},
+        )
 
 
 if __name__ == '__main__':
diff --git a/tests/ops/mapper/test_vggt_mapper.py b/tests/ops/mapper/test_vggt_mapper.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1,4 +1,4 @@` |
| `1` |  | `-__version__ = "1.4.6"` |
|  | `1` | `+__version__ = "1.5.0"` |
| `2` | `2` |  |
| `3` | `3` | `import sys` |
| `4` | `4` |  |