Skip to content

Commit 1a3dc30

Browse files
committed
Reorganize some code.
1 parent 0ce778f commit 1a3dc30

File tree

2 files changed

+129
-113
lines changed

2 files changed

+129
-113
lines changed

graph_net/collect_stats_util.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
import ast
import importlib
import importlib.util
import inspect
from dataclasses import dataclass, field
from typing import Dict, Optional
6+
7+
8+
@dataclass
class OpStat:
    """Per-operator statistics: an occurrence count plus a dtype histogram."""

    op_name: str
    # Maps dtype string -> number of times this op ran with that dtype.
    op_dtypes: dict[str, int] = field(default_factory=dict)
    count: int = 0

    def update(self, other):
        """Merge *other* into self; silently ignored unless it is an OpStat for the same op."""
        if not isinstance(other, OpStat) or self.op_name != other.op_name:
            return
        self.count += other.count
        for dtype_name, dtype_count in other.op_dtypes.items():
            self.op_dtypes[dtype_name] = self.op_dtypes.get(dtype_name, 0) + dtype_count
19+
20+
21+
@dataclass
class ModelStats:
    """Aggregated statistics collected for a single model sample.

    Scalar fields default to None (not yet collected); histogram fields
    default to empty dicts mapping a name/dtype string to an occurrence count.
    """

    # Path of the model sample directory this record describes.
    model_path: str
    # Fix: these fields default to None, so annotate them Optional instead of
    # the bare types the original declared.
    num_inputs: Optional[int] = None
    num_params: Optional[int] = None
    num_outputs: Optional[int] = None
    num_ops: Optional[int] = None
    model_size_in_billion: Optional[float] = None
    # dtype string -> count histograms.
    input_dtypes: Dict[str, int] = field(default_factory=dict)
    param_dtypes: Dict[str, int] = field(default_factory=dict)
    op_dtypes: Dict[str, int] = field(default_factory=dict)
    # op name -> count histogram.
    ops: Dict[str, int] = field(default_factory=dict)
    # Provenance of the graph and its heuristic classification tag
    # (read from graph_net.json by the caller).
    source: Optional[str] = None
    heuristic_tag: Optional[str] = None
35+
36+
37+
def print_model_stats(stats, log_prompt):
    """Emit one `[ModelStats.<key>]` log line per field of *stats*.

    Each line is prefixed with *log_prompt* and the model path so downstream
    tooling can grep a single stat across many models.
    """
    assert isinstance(stats, ModelStats), f"{type(stats)=}"

    def dict_to_string(d):
        # Render a histogram as space-separated "key:value" tokens.
        return " ".join(f"{k}:{v}" for k, v in d.items())

    def print_with_log_prompt(key, value):
        print(
            f"{log_prompt} [ModelStats.{key}] model_path:{stats.model_path} {value}",
            flush=True,
        )

    # Table-driven emission: (log key, rendered value) in a fixed order.
    for key, value in (
        ("num_inputs", stats.num_inputs),
        ("num_params", stats.num_params),
        ("num_outputs", stats.num_outputs),
        ("num_ops", stats.num_ops),
        ("model_size", f"{stats.model_size_in_billion}B"),
        ("input_dtypes", dict_to_string(stats.input_dtypes)),
        ("param_dtypes", dict_to_string(stats.param_dtypes)),
        ("op_dtypes", dict_to_string(stats.op_dtypes)),
        ("ops", dict_to_string(stats.ops)),
        ("source", stats.source),
        ("heuristic_tag", stats.heuristic_tag),
    ):
        print_with_log_prompt(key, value)
61+
62+
63+
def load_class_from_file(file_path, class_name):
    """Import *file_path* as a throwaway module and fetch *class_name* from it.

    Returns None when the executed module defines no attribute with that name.
    Note: executing the module runs its top-level code.
    """
    module_spec = importlib.util.spec_from_file_location("unnamed", file_path)
    loaded_module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(loaded_module)
    return getattr(loaded_module, class_name, None)
69+
70+
71+
def get_argument_name_and_types(model_class, func_name):
    """Map each parameter name of `model_class.func_name` to its annotation.

    `self` is excluded and an unannotated parameter maps to None.  Returns an
    empty dict when no function member with that name exists.
    """
    for member_name, member in inspect.getmembers(
        model_class, predicate=inspect.isfunction
    ):
        if member_name != func_name:
            continue
        return {
            param_name: (
                None if param.annotation is inspect._empty else param.annotation
            )
            for param_name, param in inspect.signature(member).parameters.items()
            if param_name != "self"
        }
    return {}
81+
82+
83+
def get_number_of_returns(file_path, class_name, func_name):
    """Statically count how many values `class_name.func_name` returns.

    Parses *file_path* and inspects the first `return` statement of the named
    method: a bare `return` (or no return at all, or no such class/method)
    yields 0, a tuple return yields the tuple arity, anything else yields 1.

    Fix over the original: the original used `ast.walk`, which descends into
    nested `def`/`lambda` scopes, so a `return` belonging to an inner function
    was miscounted as the method's own.  Nested scopes are now skipped, and
    `async def` methods are matched as well.
    """
    with open(file_path, "r") as f:
        source = f.read()

    tree = ast.parse(source)
    for node in tree.body:
        if isinstance(node, ast.ClassDef) and node.name == class_name:
            for func in node.body:
                if (
                    isinstance(func, (ast.FunctionDef, ast.AsyncFunctionDef))
                    and func.name == func_name
                ):
                    for stmt in _walk_excluding_nested_functions(func):
                        if isinstance(stmt, ast.Return):
                            if stmt.value is None:
                                return 0
                            elif isinstance(stmt.value, ast.Tuple):
                                return len(stmt.value.elts)
                            else:
                                return 1
    return 0


def _walk_excluding_nested_functions(func):
    """Yield descendants of *func*'s body, without entering nested function scopes."""
    pending = list(func.body)
    while pending:
        node = pending.pop(0)
        yield node
        # Do not descend into nested defs/lambdas: their returns are not ours.
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.Lambda)):
            pending.extend(ast.iter_child_nodes(node))

graph_net/paddle/collect_stats.py

Lines changed: 28 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -2,65 +2,19 @@
22
import os
33
import re
44
import sys
5-
import ast
65
import math
7-
import importlib
8-
import inspect
96
import subprocess
107
from datetime import datetime
11-
from typing import Type
12-
from dataclasses import dataclass, field
13-
from collections import defaultdict
148

159
import paddle
10+
from graph_net import collect_stats_util
1611
from graph_net.paddle import utils
1712

1813

1914
def is_single_model_dir(model_dir):
    """True when *model_dir* directly contains a graph_net.json marker file."""
    marker_path = f"{model_dir}/graph_net.json"
    return os.path.isfile(marker_path)
2116

2217

23-
def load_class_from_file(file_path: str, class_name: str) -> Type[paddle.nn.Layer]:
24-
spec = importlib.util.spec_from_file_location("unnamed", file_path)
25-
unnamed = importlib.util.module_from_spec(spec)
26-
spec.loader.exec_module(unnamed)
27-
model_class = getattr(unnamed, class_name, None)
28-
return model_class
29-
30-
31-
def get_argument_name_and_types(model_class, func_name):
32-
argument_name2types = {}
33-
for name, func in inspect.getmembers(model_class, predicate=inspect.isfunction):
34-
if name == func_name:
35-
for arg_name, arg in inspect.signature(func).parameters.items():
36-
if arg_name != "self":
37-
argument_name2types[arg_name] = (
38-
None if arg.annotation is inspect._empty else arg.annotation
39-
)
40-
return argument_name2types
41-
42-
43-
def get_number_of_returns(file_path, class_name, func_name):
44-
source = None
45-
with open(f"{file_path}", "r") as f:
46-
source = f.read()
47-
48-
tree = ast.parse(source)
49-
for node in tree.body:
50-
if isinstance(node, ast.ClassDef) and node.name == class_name:
51-
for f in node.body:
52-
if isinstance(f, ast.FunctionDef) and f.name == func_name:
53-
for stmt in ast.walk(f):
54-
if isinstance(stmt, ast.Return):
55-
if stmt.value is None:
56-
return 0
57-
elif isinstance(stmt.value, ast.Tuple):
58-
return len(stmt.value.elts)
59-
else:
60-
return 1
61-
return 0
62-
63-
6418
def read_graph_source_and_tag(model_path):
6519
try:
6620
with open(os.path.join(model_path, "graph_net.json"), "r") as f:
@@ -87,19 +41,6 @@ def get_input_spec(model_path):
8741
return input_spec
8842

8943

90-
@dataclass
91-
class OpStat:
92-
op_name: str
93-
op_dtypes: dict[str, int] = field(default_factory=dict)
94-
count: int = 0
95-
96-
def update(self, other):
97-
if isinstance(other, OpStat) and self.op_name == other.op_name:
98-
self.count += other.count
99-
for name, count in other.op_dtypes.items():
100-
self.op_dtypes[name] = self.op_dtypes.get(name, 0) + count
101-
102-
10344
class ProgramAnalyzer:
10445
def __init__(self):
10546
self.op_stats = {}
@@ -112,7 +53,9 @@ def update_op_stats(self, op_name, op_dtype):
11253
if op_name is not None:
11354
dtype_str = str(op_dtype).replace("paddle.", "")
11455
if self.op_stats.get(op_name, None) is None:
115-
self.op_stats[op_name] = OpStat(op_name, {dtype_str: 1}, 1)
56+
self.op_stats[op_name] = collect_stats_util.OpStat(
57+
op_name, {dtype_str: 1}, 1
58+
)
11659
else:
11760
self.op_stats[op_name].op_dtypes[dtype_str] = (
11861
self.op_stats[op_name].op_dtypes.get(dtype_str, 0) + 1
@@ -213,9 +156,8 @@ def collect_op_stats(model, model_path):
213156

214157
def collect_model_stats(model_path, log_prompt):
215158
file_path = os.path.join(model_path, "model.py")
216-
model_class = load_class_from_file(file_path, "GraphModule")
159+
model_class = collect_stats_util.load_class_from_file(file_path, "GraphModule")
217160
model = model_class()
218-
num_outputs = get_number_of_returns(file_path, "GraphModule", "forward")
219161

220162
model_size = 0
221163
input_dtypes = {}
@@ -244,39 +186,33 @@ def collect_model_stats(model_path, log_prompt):
244186
elif name in inputs.keys():
245187
input_dtypes[dtype_str] = input_dtypes.get(dtype_str, 0) + 1
246188

247-
model_size_in_billion = model_size / 1e9
248-
num_params = sum(param_dtypes.values())
249-
num_inputs = sum(input_dtypes.values())
250-
num_ops = sum(ops_count_dict.values())
189+
num_outputs = collect_stats_util.get_number_of_returns(
190+
file_path, "GraphModule", "forward"
191+
)
192+
num_ops = program_analyzer.num_ops if program_analyzer is not None else 0
251193
source, heuristic_tag = read_graph_source_and_tag(model_path)
252-
method = "to_static"
253194
is_complete = (
254195
program_analyzer.is_complete if program_analyzer is not None else False
255196
)
197+
print(
198+
f"model_stats collection information: model_path={model_path}, method=to_static, is_ops_complete={is_complete}"
199+
)
256200

257-
def dict_to_string(d):
258-
kv_list = [f"{k}:{v}" for k, v in d.items()]
259-
return " ".join(kv_list)
260-
261-
def print_with_log_prompt(key, value):
262-
print(
263-
f"{log_prompt} [ModelStats.{key}] model_path:{model_path} {value}",
264-
flush=True,
265-
)
266-
267-
print_with_log_prompt("num_inputs", num_inputs)
268-
print_with_log_prompt("num_params", num_params)
269-
print_with_log_prompt("num_outputs", num_outputs)
270-
print_with_log_prompt("num_ops", num_ops)
271-
print_with_log_prompt("model_size", f"{model_size_in_billion}B")
272-
print_with_log_prompt("input_dtypes", dict_to_string(input_dtypes))
273-
print_with_log_prompt("param_dtypes", dict_to_string(param_dtypes))
274-
print_with_log_prompt("op_dtypes", dict_to_string(op_dtypes))
275-
print_with_log_prompt("ops", dict_to_string(ops_count_dict))
276-
print_with_log_prompt("source", source)
277-
print_with_log_prompt("heuristic_tag", heuristic_tag)
278-
print_with_log_prompt("method", method)
279-
print_with_log_prompt("is_complete", is_complete)
201+
stats = collect_stats_util.ModelStats(
202+
model_path=model_path,
203+
num_inputs=sum(input_dtypes.values()),
204+
num_params=sum(param_dtypes.values()),
205+
num_outputs=num_outputs,
206+
num_ops=num_ops,
207+
model_size_in_billion=model_size / 1e9,
208+
input_dtypes=input_dtypes,
209+
param_dtypes=param_dtypes,
210+
op_dtypes=op_dtypes,
211+
ops=ops_count_dict,
212+
source=source,
213+
heuristic_tag=heuristic_tag,
214+
)
215+
collect_stats_util.print_model_stats(stats, log_prompt)
280216

281217

282218
def main(args):
@@ -295,23 +231,9 @@ def main(args):
295231
else args.graph_net_samples_path
296232
)
297233

298-
previous_failed_model_pathes = []
299-
if args.previous_collect_result_path is not None:
300-
with open(args.previous_collect_result_path, "r") as f:
301-
for line in f.readlines():
302-
if "[ModelStats]" in line:
303-
fields = line.strip().split()
304-
model_path = fields[2].split(":")[-1]
305-
is_complete = fields[-1].split(":")[-1]
306-
if is_complete == "False":
307-
previous_failed_model_pathes.append(model_path)
308-
309234
i = 0
310235
for root, dirs, files in os.walk(graph_net_samples_path):
311-
if is_single_model_dir(root) and (
312-
args.previous_collect_result_path is None
313-
or root in previous_failed_model_pathes
314-
):
236+
if is_single_model_dir(root):
315237
print(f"[{i}] Collect information for {root}")
316238
cmd = [
317239
"python",
@@ -359,13 +281,6 @@ def main(args):
359281
default=None,
360282
help="GraphNet samples directory. e.g '../../paddle_samples'",
361283
)
362-
parser.add_argument(
363-
"--previous-collect-result-path",
364-
type=str,
365-
required=False,
366-
default=None,
367-
help="Previous collect result path, use to recollect the failed cases",
368-
)
369284
parser.add_argument(
370285
"--log-prompt",
371286
type=str,

0 commit comments

Comments
 (0)