 from graph_net.analysis_util import get_incorrect_models
 from graph_net import path_utils

-kMaxGraphSize = 4096
+MAX_GRAPH_SIZE = 4096


 def convert_b64_string_to_json(b64str):
@@ -109,9 +109,30 @@ def _print(self):
         print()


+@dataclass
+class ModelRecord:
+    original_path: str
+    uniform_split_positions: List[int] = field(default_factory=list)
+    subgraph_paths: List[str] = field(default_factory=list)
+    incorrect_subgraph_idxs: List[int] = field(default_factory=list)
+
+    def get_split_positions(self, decompose_method):
+        if decompose_method == "fixed-start":
+            assert (
+                len(self.uniform_split_positions) >= 2
+            ), f"{self.uniform_split_positions=}"
+            return [0, self.uniform_split_positions[1]]
+        return self.uniform_split_positions
+
+    def update_for_next_decompose(self, subgraph_idx, max_subgraph_size):
+        self.uniform_split_positions = reconstruct_split_positions_for_subgraphs(
+            self.uniform_split_positions, subgraph_idx, max_subgraph_size
+        )
+
+
 @dataclass
 class DecomposeConfig:
-    method: str
+    decompose_method: str
     tolerance: int | List[int]
     max_subgraph_size: int = -1
     tasks_map: Dict[str, Union[int, str, list, dict]] = field(default_factory=dict)
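The new `ModelRecord` centralizes the per-model bookkeeping that was previously spread across `tasks_map` dicts. A condensed, runnable sketch of its split-position behavior (the values are illustrative):

```python
from dataclasses import dataclass, field
from typing import List

@dataclass
class ModelRecord:  # condensed copy of the class added above
    original_path: str
    uniform_split_positions: List[int] = field(default_factory=list)

    def get_split_positions(self, decompose_method):
        # "fixed-start" probes only the first subgraph: [start, first split].
        if decompose_method == "fixed-start":
            assert len(self.uniform_split_positions) >= 2
            return [0, self.uniform_split_positions[1]]
        return self.uniform_split_positions

rec = ModelRecord("model_a", uniform_split_positions=[0, 1024, 2048, 4096])
print(rec.get_split_positions("fixed-start"))  # [0, 1024]
print(rec.get_split_positions("uniform"))      # [0, 1024, 2048, 4096]
```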
@@ -145,18 +166,28 @@ def get_incorrect_models(self, pass_id):
         assert pass_key in self.running_states
         return self.running_states[pass_key]["incorrect_models"]

-    def update_running_states(self, pass_id, **kwargs):
-        pass_key = get_pass_name(pass_id)
-        if self.running_states.get(pass_key, None) is None:
+    def update_running_states(self, pass_id, incorrect_models, model_name2record):
+        assert pass_id == "initial" or isinstance(pass_id, int)
+        pass_key = get_pass_name(pass_id) if isinstance(pass_id, int) else pass_id
+        if pass_key not in self.running_states:
             self.running_states[pass_key] = {}

-        for key, value in kwargs.items():
-            assert key in [
-                "num_incorrect_models",
-                "incorrect_models",
-                "failed_decomposition_models",
-            ]
-            self.running_states[pass_key][key] = value
+        self.running_states[pass_key]["incorrect_models_from_log"] = list(
+            sorted(incorrect_models)
+        )
+        if model_name2record:
+            target_model_names = list(model_name2record.keys())
+            model_name2subgraph_idxs = collect_incorrect_subgraph_idxs(
+                self.decompose_method,
+                target_model_names,
+                incorrect_models,
+                model_name2record,
+            )
+            for model_name, model_record in sorted(model_name2record.items()):
+                model_record.incorrect_subgraph_idxs = model_name2subgraph_idxs[
+                    model_name
+                ]
+                self.running_states[pass_key][model_name] = model_record.__dict__


 def get_rectfied_model_path(model_path):
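After this change, each pass entry holds the raw log result under `incorrect_models_from_log` plus one serialized `ModelRecord` per model (via `__dict__`). A hypothetical snapshot, assuming `get_pass_name(1)` renders `"pass_1"` and using made-up paths:

```python
# Hypothetical shape of running_states after update_running_states;
# the pass key and all paths here are assumptions for illustration.
running_states = {
    "pass_1": {
        "incorrect_models_from_log": ["samples/model_a/subgraph_1"],
        "model_a": {
            "original_path": "samples/model_a",
            "uniform_split_positions": [0, 2048, 4096],
            "subgraph_paths": [
                "samples/model_a/subgraph_0",
                "samples/model_a/subgraph_1",
            ],
            "incorrect_subgraph_idxs": [1],
        },
    },
}
```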
@@ -226,21 +257,18 @@ def run_decomposer_for_single_model(


 def run_decomposer_for_multi_models(
-    framework, tasks_map, decomposed_samples_dir, max_subgraph_size, log_path
+    framework, model_name2record, decomposed_samples_dir, max_subgraph_size, log_path
 ):
-    failed_decomposition = []
+    failed_decomposition_models = []

     print(
         f"[Decomposition] max_subgraph_size: {max_subgraph_size}, log_path: {log_path}"
     )
-    for model_name, task_info in tasks_map.items():
-        original_path = task_info["original_path"]
-        split_positions = sorted(list(task_info["split_positions"]))
-
-        method = "fixed-start"
-        if method == "fixed-start":
-            assert len(split_positions) >= 3, f"{split_positions=}"
-            split_positions = [0, split_positions[1]]
+    for model_name, model_record in model_name2record.items():
+        original_path = model_record.original_path
+        split_positions = model_record.get_split_positions(
+            decompose_method="fixed-start"
+        )

         rectified_model_path = get_rectfied_model_path(original_path)
         assert os.path.exists(
@@ -255,8 +283,8 @@ def run_decomposer_for_multi_models(
             log_path,
         )
         if not success:
-            failed_decomposition.append(rectified_model_path)
-    return tasks_map, failed_decomposition
+            failed_decomposition_models.append(rectified_model_path)
+    return failed_decomposition_models


 def run_evaluation(
@@ -314,10 +342,13 @@ def generate_initial_tasks(args):
     initial_failures = get_ranged_incorrect_models(args.tolerance, args.log_file)

     tasks_map = {}
-    max_subgraph_size = min(args.max_subgraph_size, kMaxGraphSize // 2)
+    if args.decompose_method == "fixed-start":
+        max_subgraph_size = MAX_GRAPH_SIZE
+    else:
+        max_subgraph_size = min(args.max_subgraph_size, MAX_GRAPH_SIZE)

     initial_split_positions = reconstruct_split_positions_for_subgraphs(
-        [0, kMaxGraphSize], 0, max_subgraph_size
+        [0, MAX_GRAPH_SIZE], 0, max_subgraph_size
     )
     for model_path in initial_failures:
         model_name = get_model_name_with_subgraph_tag(model_path)
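The sizing rule now depends on the method: `fixed-start` starts from the full graph (which presumably leaves the initial plan at `[0, MAX_GRAPH_SIZE]` split once), while other methods cap the user's request at `MAX_GRAPH_SIZE` rather than the old `kMaxGraphSize // 2`. A small sketch mirroring just that branch:

```python
MAX_GRAPH_SIZE = 4096

def initial_max_subgraph_size(decompose_method, requested):
    # Mirrors the branch added to generate_initial_tasks.
    if decompose_method == "fixed-start":
        return MAX_GRAPH_SIZE
    return min(requested, MAX_GRAPH_SIZE)

print(initial_max_subgraph_size("fixed-start", 1024))  # 4096
print(initial_max_subgraph_size("uniform", 1024))      # 1024
```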
@@ -327,7 +358,7 @@ def generate_initial_tasks(args):
         }

     running_states = {
-        "pass_0": {
+        "initial": {
             "num_incorrect_models": len(initial_failures),
             "incorrect_models": list(sorted(initial_failures)),
         }
@@ -343,7 +374,9 @@ def extract_model_name_and_subgraph_idx(subgraph_path):
     return model_name, subgraph_idx


-def collect_incorrect_subgraph_idxs(args, target_model_names, incorrect_models):
+def collect_incorrect_subgraph_idxs(
+    decompose_method, target_model_names, incorrect_models, model_name2record
+):
     model_name2subgraph_idxs = {}
     for subgraph_path in sorted(incorrect_models):
         model_name, subgraph_idx = extract_model_name_and_subgraph_idx(subgraph_path)
@@ -355,11 +388,17 @@ def collect_incorrect_subgraph_idxs(args, target_model_names, incorrect_models):
             model_name2subgraph_idxs[model_name] = []
         model_name2subgraph_idxs[model_name].append(subgraph_idx)

-    if args.method == "fixed-start":
+    if decompose_method == "fixed-start":
         print(model_name2subgraph_idxs)
         for model_name in target_model_names:
             if model_name not in model_name2subgraph_idxs:
-                model_name2subgraph_idxs[model_name] = [1]
+                if (
+                    model_name2record
+                    and len(model_name2record[model_name].uniform_split_positions) > 2
+                ):
+                    model_name2subgraph_idxs[model_name] = [1]
+                else:
+                    model_name2subgraph_idxs[model_name] = []
     else:
         assert len(
             model_name2subgraph_idxs[model_name]
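Under `fixed-start` only the leading subgraph is materialized and tested, so a model missing from the incorrect list presumably failed in the untested remainder (subgraph 1), unless its split plan is already down to a single piece. A sketch of that fallback in isolation:

```python
def fixed_start_fallback_idxs(uniform_split_positions):
    # More than two positions means a remainder still exists to blame.
    if len(uniform_split_positions) > 2:
        return [1]
    return []

print(fixed_start_fallback_idxs([0, 1024, 4096]))  # [1]
print(fixed_start_fallback_idxs([0, 4096]))        # []
```

When no record map is passed (generate_successor_tasks calls this with `None`), the fallback stays empty.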
@@ -375,15 +414,15 @@ def generate_successor_tasks(args, base_output_dir, current_pass_id):
     prev_config = DecomposeConfig.load(prev_pass_dir)
     max_subgraph_size = prev_config.max_subgraph_size // 2
     incorrect_models = prev_config.get_incorrect_models(current_pass_id)
-    if args.method != "fixed-start" and not incorrect_models:
+    if args.decompose_method != "fixed-start" and not incorrect_models:
         return {}, max_subgraph_size, prev_config.running_states

     tasks_map = {}
     prev_tasks_map = prev_config.tasks_map

     target_model_names = list(prev_tasks_map.keys())
     model_name2subgraph_idxs = collect_incorrect_subgraph_idxs(
-        args, target_model_names, incorrect_models
+        args.decompose_method, target_model_names, incorrect_models, None
     )

     for model_name, subgraph_idxs in model_name2subgraph_idxs.items():
@@ -393,6 +432,8 @@ def generate_successor_tasks(args, base_output_dir, current_pass_id):
         split_positions = reconstruct_split_positions_for_subgraphs(
             prev_split_positions, subgraph_idxs, max_subgraph_size
         )
+        if args.decompose_method == "fixed-start" and len(split_positions) > 3:
+            split_positions = split_positions[0:3]

         tasks_map[model_name] = {
             "original_path": pre_task_for_model["original_path"],
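For `fixed-start`, the successor task keeps at most three split positions, i.e. only the first two subgraphs of the reconstructed plan carry over to the next pass:

```python
# Illustrative values; mirrors the truncation added above.
split_positions = [0, 512, 1024, 2048, 4096]
if len(split_positions) > 3:
    split_positions = split_positions[0:3]
print(split_positions)  # [0, 512, 1024]
```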
@@ -430,58 +471,76 @@ def prepare_tasks_and_verify(args, current_pass_id, base_output_dir):
     return tasks_map, max_subgraph_size, running_states


-def execute_decomposition_phase(max_subgraph_size, tasks_map, framework, workspace):
+def collect_decomposed_subgraphs(model_name2record, decomposed_samples_dir):
+    for root, dirs, files in os.walk(decomposed_samples_dir):
+        if path_utils.is_single_model_dir(root):
+            model_name, _ = extract_model_name_and_subgraph_idx(root)
+            assert model_name in model_name2record
+            model_record = model_name2record[model_name]
+            model_record.subgraph_paths.append(root)
+    return model_name2record
+
+
+def execute_decomposition_phase(
+    max_subgraph_size, model_name2record, framework, workspace
+):
     """Executes the decomposition phase."""

-    failed_decomposition = []
-    need_decompose = True if len(tasks_map) > 0 else False
-    method = "fixed-start"
+    failed_decomposition_models = []
+    need_decompose = True if len(model_name2record) > 0 else False
+    decompose_method = "fixed-start"
+    decomposed_samples_dir = os.path.join(
+        workspace, "samples" if framework == "torch" else "paddle_samples"
+    )

     while need_decompose:
-        decomposed_samples_dir = os.path.join(
-            workspace, "samples" if framework == "torch" else "paddle_samples"
-        )
         if not os.path.exists(decomposed_samples_dir):
             os.makedirs(decomposed_samples_dir, exist_ok=True)
         print(f"[Decomposition] decomposed_samples_dir: {decomposed_samples_dir}")

         log_path = os.path.join(
             workspace, f"log_decompose-max_subgraph_size_{max_subgraph_size}.txt"
         )
-        tasks_map, failed_decomposition = run_decomposer_for_multi_models(
-            framework, tasks_map, decomposed_samples_dir, max_subgraph_size, log_path
+        failed_decomposition_models = run_decomposer_for_multi_models(
+            framework,
+            model_name2record,
+            decomposed_samples_dir,
+            max_subgraph_size,
+            log_path,
         )
         num_decomposed_samples = count_samples(decomposed_samples_dir)
         print(
-            f"[Decomposition] number of graphs: {len(tasks_map)} -> {num_decomposed_samples}",
+            f"[Decomposition] number of graphs: {len(model_name2record)} -> {num_decomposed_samples}",
             flush=True,
         )
         if (
-            not failed_decomposition
-            and num_decomposed_samples == len(tasks_map)
+            not failed_decomposition_models
+            and num_decomposed_samples == len(model_name2record)
             and max_subgraph_size > 1
-            and method != "fixed-start"
+            and decompose_method != "fixed-start"
         ):
             need_decompose = True
             shutil.rmtree(decomposed_samples_dir)
             os.makedirs(decomposed_samples_dir, exist_ok=True)
             max_subgraph_size = max(1, max_subgraph_size // 2)
-            for model_name, task_info in tasks_map.items():
-                split_positions = task_info["split_positions"]
-                if not split_positions or len(split_positions) < 2:
+            for model_name, model_record in model_name2record.items():
+                if (
+                    not model_record.uniform_split_positions
+                    or len(model_record.uniform_split_positions) < 2
+                ):
                     continue
-                new_split_positions = reconstruct_split_positions_for_subgraphs(
-                    split_positions, 0, max_subgraph_size
-                )
-                task_info["split_positions"] = new_split_positions
+                model_record.update_for_next_decompose(0, max_subgraph_size)
         else:
             need_decompose = False
     print()

-    if failed_decomposition:
-        print(f"[WARN] {len(failed_decomposition)} models failed to decompose.")
+    if failed_decomposition_models:
+        print(f"[WARN] {len(failed_decomposition_models)} models failed to decompose.")

-    return tasks_map, failed_decomposition, max_subgraph_size
+    model_name2record = collect_decomposed_subgraphs(
+        model_name2record, decomposed_samples_dir
+    )
+    return model_name2record, max_subgraph_size


 def count_unique_original_models(incorrect_models):
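`collect_decomposed_subgraphs` replaces the old habit of returning `tasks_map`: after decomposition, every directory that `path_utils.is_single_model_dir` accepts is attached to its owning record. A minimal sketch with both helpers stubbed, since their real definitions are not part of this patch; the `subgraph_<i>` directory layout is an assumption:

```python
import os

def is_single_model_dir(path):
    # Hypothetical stand-in for path_utils.is_single_model_dir.
    return os.path.basename(path).startswith("subgraph_")

def collect(subgraph_paths_by_model, samples_dir):
    # Walk the output tree and group subgraph dirs by model name.
    for root, dirs, files in os.walk(samples_dir):
        if is_single_model_dir(root):
            model = os.path.basename(os.path.dirname(root))
            subgraph_paths_by_model.setdefault(model, []).append(root)
    return subgraph_paths_by_model
```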
@@ -518,8 +577,16 @@ def main(args):
     tasks_map, max_subgraph_size, running_states = prepare_tasks_and_verify(
         args, current_pass_id, base_output_dir
     )
+
+    model_name2record = {}
+    for model_name in tasks_map.keys():
+        model_name2record[model_name] = ModelRecord(
+            original_path=tasks_map[model_name]["original_path"],
+            uniform_split_positions=tasks_map[model_name]["split_positions"],
+        )
+
     decompose_config = DecomposeConfig(
-        method=args.method,
+        decompose_method=args.decompose_method,
         tolerance=args.tolerance,
         max_subgraph_size=max_subgraph_size,
         tasks_map=tasks_map,
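`main` now bridges the legacy `tasks_map` into records before the phases run; an equivalent, slightly more compact construction (a sketch assuming the same dict layout):

```python
model_name2record = {
    name: ModelRecord(
        original_path=info["original_path"],
        uniform_split_positions=info["split_positions"],
    )
    for name, info in tasks_map.items()
}
```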
@@ -533,14 +600,10 @@ def main(args):
     if task_controller.task_scheduler["run_decomposer"]:
         print("\n--- Phase 1: Decomposition ---", flush=True)
         (
-            tasks_map,
-            failed_decomposition,
+            model_name2record,
             max_subgraph_size,
         ) = execute_decomposition_phase(
-            max_subgraph_size, tasks_map, args.framework, work_dir
-        )
-        decompose_config.update_running_states(
-            current_pass_id, failed_decomposition_models=list(failed_decomposition)
+            max_subgraph_size, model_name2record, args.framework, work_dir
         )
     else:
         print("\n--- Phase 1: Decomposition (skipped) ---", flush=True)
@@ -560,22 +623,26 @@ def main(args):
     print(f"\n--- Phase 3: Analysis (tolerance={tolerance}) ---")
     next_pass_incorrect_models = sorted(get_incorrect_models(tolerance, log_path))
     num_original_models = count_unique_original_models(next_pass_incorrect_models)
+
     decompose_config.update_running_states(
-        current_pass_id + 1,
-        num_incorrect_models=num_original_models,
-        incorrect_models=list(next_pass_incorrect_models),
+        current_pass_id,
+        next_pass_incorrect_models,
+        model_name2record,
     )

     print(
         f"[Analysis] Found {len(next_pass_incorrect_models)} incorrect subgraphs ({num_original_models} original models)."
     )
     for idx, model_path in enumerate(next_pass_incorrect_models):
         print(f"- [{idx}] {model_path}")
+
     print_summary_and_suggestion(
         args, next_pass_incorrect_models, max_subgraph_size
     )

     # --- Step 5: Save States ---
+    for model_name, model_record in model_name2record.items():
+        print(f"- {model_name}: {model_record}")
     decompose_config.save(work_dir)


@@ -587,7 +654,7 @@ def main(args):
     parser.add_argument(
         "--test-config", type=str, required=True, help="Base64 encoded test config"
     )
-    parser.add_argument("--method", type=str, required=True)
+    parser.add_argument("--decompose-method", type=str, required=True)
     parser.add_argument(
         "--tolerance",
         type=int,
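Note that argparse converts the hyphen in `--decompose-method` to an underscore, so the `args.decompose_method` references introduced throughout this patch resolve against the renamed flag:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--decompose-method", type=str, required=True)
args = parser.parse_args(["--decompose-method", "fixed-start"])
print(args.decompose_method)  # fixed-start
```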