* fix wrong way to check the types of processors in task parser

HYLcool · HYLcool · commit c93ff10e8a74 · 2025-06-17T10:18:46.000+08:00
* fix: make dirs when initializing file writer
diff --git a/tests/test_configs/active_iterator_test_cfg.yaml b/tests/test_configs/active_iterator_test_cfg.yaml
@@ -2,8 +2,14 @@ data_processor:
   # basic info
   task_pipeline:
     input_buffers:
-      - path: 'tests/test_data/test_10/'
+      - name: 'raw_input'
+        path: 'tests/test_data/test_10/'
+        storage_type: 'file'
         raw: true
+    output_buffer:
+      name: 'raw_output'
+      path: './outputs/task_pipelien_output/processed.jsonl'
+      storage_type: 'file'
     format:
       prompt_key: 'problem'
       response_key: 'solution'
diff --git a/tests/test_configs/active_iterator_test_dj_cfg.yaml b/tests/test_configs/active_iterator_test_dj_cfg.yaml
@@ -1,7 +1,5 @@
 project_name: 'demo-process'
 
-export_path: './outputs/demo-process/demo-processed.jsonl'
-
 text_keys: 'solution'
 
 process:
diff --git a/trinity/buffer/writer/file_writer.py b/trinity/buffer/writer/file_writer.py
@@ -26,6 +26,8 @@ def __init__(self, meta: StorageConfig, config: BufferConfig):
         ext = os.path.splitext(meta.path)[-1]
         if ext != ".jsonl" and ext != ".json":
             raise ValueError(f"File path must end with .json or .jsonl, got {meta.path}")
+        path_dir = os.path.dirname(meta.path)
+        os.makedirs(path_dir, exist_ok=True)
         self.file = open(meta.path, "a", encoding="utf-8")
         self.encoder = _Encoder(ensure_ascii=False)
 
diff --git a/trinity/data/controllers/active_iterator.py b/trinity/data/controllers/active_iterator.py
@@ -159,13 +159,20 @@ def run(self):
                 traceback.print_exc()
                 return 7, "Tracking lineage failed."
 
-            # step 8. sort and export the result to the output buffer
+            # step 8. sort the result by priority if the "priority" field exists
+            try:
+                if "priority" in res_dataset.data.features:
+                    res_dataset.sort_by("priority", reverse=True)
+            except Exception:
+                traceback.print_exc()
+                return 8, "Sorting results by priority failed."
+
+            # step 9. export the result to the output buffer
             try:
-                res_dataset.sort_by("priority", reverse=True)
                 res_dataset.write_to_buffer()
             except Exception:
                 traceback.print_exc()
-                return 8, "Exporting result to output buffer failed."
+                return 9, "Exporting result to output buffer failed."
 
         return 0, "success"
 
@@ -247,7 +254,7 @@ def _compute_combined_score(
             difficulty = stats.get("difficulty_score", 0.5)
             score += self.priority_weights["difficulty"] * difficulty
 
-        sample["priority"] = score
+        sample["priority"] = [score]
         return sample
 
     def _compute_diversity_score(self) -> float:
diff --git a/trinity/data/controllers/task_parser.py b/trinity/data/controllers/task_parser.py
@@ -170,12 +170,15 @@ def _check_types_of_processors(self, dj_config):
         process_list = dj_config.get("process", [])
         for op in process_list:
             op_name = list(op.keys())[0]
-            if op_name in DEFAULT_CLEANER:
-                hit_cleaner = True
-            elif op_name in DEFAULT_SYNTHESIZER:
+            if op_name in DEFAULT_SYNTHESIZER:
                 hit_synthesizer = True
             elif op_name in DEFAULT_HUMAN_ANNOTATOR:
                 hit_human_annotator = True
+            else:
+                for dimension in DEFAULT_CLEANER:
+                    if op_name in DEFAULT_CLEANER[dimension]:
+                        hit_cleaner = True
+                        break
         return hit_cleaner, hit_synthesizer, hit_human_annotator
 
     def _update_common_op_args(self, dj_config: Namespace, extra_op_args: Dict) -> Namespace:
diff --git a/trinity/data/processors/cleaner.py b/trinity/data/processors/cleaner.py
@@ -166,15 +166,14 @@ def process(
         else:
             logger.info("Executing Data-Juicer analyzer...")
             analyzer = Analyzer(self.dj_cfg)
-            analyzer.run(dataset)
+            analyzer.run(dataset, skip_export=True)
             df = analyzer.overall_result
             mean_series = df[df.index == "mean"]
             stats_key_to_mean = mean_series.iloc[0, :].to_dict()
             std_series = df[df.index == "std"]
             stats_key_to_std = std_series.iloc[0, :].to_dict()
 
             tmp_cfg = copy.deepcopy(self.dj_cfg)
-            print(tmp_cfg)
             self.op_name_to_stats_key = StatsKeys.get_access_log(dj_cfg=tmp_cfg, dataset=dataset)
 
         for try_idx in range(max_tries):
diff --git a/trinity/data/server.py b/trinity/data/server.py
@@ -16,7 +16,7 @@ def data_workflow(pipeline_type):
     pipeline_type = escape(pipeline_type)
     config = load_config(config_path)
 
-    pipeline_config = getattr(config, pipeline_type)
+    pipeline_config = getattr(config.data_processor, pipeline_type)
     if pipeline_config is None:
         return jsonify(
             {

Original file line number	Diff line number	Diff line change
`@@ -16,7 +16,7 @@ def data_workflow(pipeline_type):`
`16`	`16`	`pipeline_type = escape(pipeline_type)`
`17`	`17`	`config = load_config(config_path)`
`18`	`18`
`19`		`- pipeline_config = getattr(config, pipeline_type)`
	`19`	`+ pipeline_config = getattr(config.data_processor, pipeline_type)`
`20`	`20`	`if pipeline_config is None:`
`21`	`21`	`return jsonify(`
`22`	`22`	`{`