Skip to content

Commit 528d46c

Browse files
committed
Merge branch 'develop' of github.com:PaddlePaddle/GraphNet into develop
2 parents 16264a7 + de2d8c4 commit 528d46c

12 files changed

+253
-42
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,9 @@ python -m graph_net.plot_violin \
9696

9797
The scripts are designed to process a file structure as `/benchmark_path/category_name/`, and items on the x-axis are identified by the names of the sub-directories. After executing, several summary plots of the results grouped by category (model tasks, libraries...) will be exported to `$GRAPH_NET_BENCHMARK_PATH`.
9898

99+
### Hardware Regression Testing
100+
We also provide a two-step workflow that validates compiler correctness and performance against a "golden" reference, which is crucial for hardware-specific testing and regression tracking. Details can be found in this [guide](./docs/hardware_test.md).
101+
99102
### 🧱 Construction & Contribution Guide
100103
Want to understand how GraphNet is built or contribute new samples?
101104
Check out the [Construction Guide](./docs/README_contribute.md) for details on the extraction and validation workflow.

docs/hardware_test.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
## Hardware Regression Testing
2+
### Step 1: Generate Reference Data
3+
First, use `graph_net.paddle.test_reference_device` on a trusted setup (e.g., a specific hardware/compiler version) to generate baseline logs and output files.
4+
```bash
5+
python -m graph_net.paddle.test_reference_device \
6+
--model-path /path/to/all_models/ \
7+
    --reference-dir ./golden_reference \
8+
--compiler cinn \
9+
--device cuda
10+
# --reference-dir: (Required) Directory where the output .log (performance/config) and .pdout (output tensors) files will be saved.
11+
# --compiler: Specifies the compiler backend.
12+
```
13+
### Step 2: Run Regression Test
14+
After changing hardware, run the correctness test script. This script reads the reference data, re-runs the models using the exact same configuration, and compares the new results against the "golden" reference.
15+
```bash
16+
python -m graph_net.paddle.test_device_correctness \
17+
--reference-dir ./golden_reference \
18+
--device cuda
19+
```
20+
This script will report any failures (e.g., compilation errors, output mismatches) and print a performance comparison (speedup/slowdown) against the reference log, allowing you to quickly identify regressions.

graph_net/imp_util.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import importlib.util as imp
2+
3+
4+
def load_module(path, name="unamed"):
5+
spec = imp.spec_from_file_location(name, path)
6+
module = imp.module_from_spec(spec)
7+
spec.loader.exec_module(module)
8+
return module
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/bin/bash
2+
3+
GRAPH_NET_ROOT=$(python3 -c "import graph_net; import os; print(
4+
os.path.dirname(graph_net.__file__))")
5+
6+
# input model path
7+
MODEL_PATH_IN_SAMPLES=/timm/resnet18
8+
read -r -d '' extractor_config_json_str <<EOF
9+
{
10+
"custom_extractor_path": "$GRAPH_NET_ROOT/torch/naive_graph_decomposer.py",
11+
"custom_extractor_config": {
12+
"output_dir": "/tmp/naive_decompose_workspace",
13+
"split_positions": [8, 16, 32],
14+
"group_head_and_tail": true,
15+
"chain_style": true
16+
}
17+
}
18+
EOF
19+
EXTRACTOR_CONFIG=$(echo $extractor_config_json_str | base64 -w 0)
20+
21+
mkdir -p /tmp/naive_decompose_workspace
22+
python3 -m graph_net.torch.single_device_runner --model-path $GRAPH_NET_ROOT/../samples/$MODEL_PATH_IN_SAMPLES --enable-extract True --extract-name resnet18 --dump-graph-hash-key --extractor-config=$EXTRACTOR_CONFIG
Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,23 @@
11
#!/bin/bash
2-
# input model path
3-
MODEL_PATH_IN_SAMPLES=/timm/resnet18
4-
# output model path
5-
OUTPUT_DIR=/tmp/naive_decompose_workspace
62

7-
mkdir -p $OUTPUT_DIR
8-
# extract subgraph 0-8, 8-16
9-
export GRAPH_NET_NAIVE_DECOMPOSER_SPLIT_POS=0,8,16
10-
export GRAPH_NET_EXTRACT_WORKSPACE=$OUTPUT_DIR
113
GRAPH_NET_ROOT=$(python3 -c "import graph_net; import os; print(
124
os.path.dirname(graph_net.__file__))")
13-
python3 -m graph_net.torch.single_device_runner --model-path $GRAPH_NET_ROOT/../samples/$MODEL_PATH_IN_SAMPLES --enable-extract True --extract-name resnet18 --dump-graph-hash-key --custom-extractor-path=$GRAPH_NET_ROOT/torch/naive_graph_decomposer.py
5+
6+
# input model path
7+
MODEL_PATH_IN_SAMPLES=/timm/resnet18
8+
read -r -d '' extractor_config_json_str <<EOF
9+
{
10+
"custom_extractor_path": "$GRAPH_NET_ROOT/torch/naive_graph_decomposer.py",
11+
"custom_extractor_config": {
12+
"output_dir": "/tmp/naive_decompose_workspace",
13+
"split_positions": [8, 32],
14+
"group_head_and_tail": true,
15+
"filter_path":"$GRAPH_NET_ROOT/torch/naive_subgraph_filter.py",
16+
"filter_config": {}
17+
}
18+
}
19+
EOF
20+
EXTRACTOR_CONFIG=$(echo $extractor_config_json_str | base64 -w 0)
21+
22+
mkdir -p /tmp/naive_decompose_workspace
23+
python3 -m graph_net.torch.single_device_runner --model-path $GRAPH_NET_ROOT/../samples/$MODEL_PATH_IN_SAMPLES --enable-extract True --extract-name resnet18 --dump-graph-hash-key --extractor-config=$EXTRACTOR_CONFIG

graph_net/torch/backend/unstable_to_stable_backend.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,27 @@ def _impl_unstable_to_stable_fftn(self, gm):
126126

127127
return gm
128128

129+
def _impl_unstable_to_stable_special_logit(self, gm):
130+
"""
131+
Convert torch._C._special.special_logit to torch.special.logit
132+
"""
133+
issue_nodes = (
134+
node
135+
for node in gm.graph.nodes
136+
if node.op == "call_function"
137+
if hasattr(node.target, "__module__")
138+
if node.target.__module__ == "torch._C._special"
139+
if hasattr(node.target, "__name__")
140+
if node.target.__name__ == "special_logit"
141+
)
142+
for node in issue_nodes:
143+
node.target = torch.special.logit
144+
145+
# Recompile the graph
146+
gm.recompile()
147+
148+
return gm
149+
129150
def unstable_to_stable(self, gm):
130151
methods = (
131152
name

graph_net/torch/decompose_util.py

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,13 @@ def convert_to_submodules_graph(
99
original_gm: torch.fx.GraphModule,
1010
split_positions: list[int],
1111
submodule_hook=None,
12-
submodule_name_prefix="extraced_submodule",
12+
submodule_name_prefix="extracted_submodule",
13+
chain_style=False,
1314
group_head_and_tail=True,
1415
):
16+
"""
17+
chain_style=True: decompose original_gm into g0 * g1 * g2 * g3
18+
"""
1519
original_gm = copy.deepcopy(original_gm)
1620
num_placeholders = len(
1721
[node for node in original_gm.graph.nodes if node.op == "placeholder"]
@@ -68,14 +72,22 @@ def get_end_node_idx(range_idx):
6872
return i + 1
6973
raise NotImplementedError("Dead code.")
7074

75+
def print_submodule_call(prompt, gm):
76+
submodule_call_stmts = [
77+
stmt for stmt in gm.code.split("\n") if "self.extracted_submodule" in stmt
78+
]
79+
print(f"{prompt} ", submodule_call_stmts)
80+
7181
for range_idx in range(len(range_idx2submodule_body_nodes)):
7282
(
7383
submodule_input_nodes,
7484
submodule_output_nodes,
85+
identity_nodes,
7586
) = _get_submodule_inputs_and_outputs(
7687
original_gm=original_gm,
7788
start_node_idx=get_start_node_idx(range_idx),
7889
end_node_idx=get_end_node_idx(range_idx),
90+
chain_style=chain_style,
7991
)
8092

8193
def get_input_nodes(range_idx):
@@ -130,15 +142,22 @@ def get_output_nodes(range_idx):
130142
prev_node = new_output_node
131143

132144
# Replace all use of outputs
145+
identity_node_set = set(identity_nodes)
133146
for original_output in get_output_nodes(range_idx):
134-
original_output.replace_all_uses_with(node_map[original_output])
147+
if original_output not in identity_node_set:
148+
original_output.replace_all_uses_with(node_map[original_output])
135149

136150
# Erase old nodes
137151
for node in reversed(get_body_nodes(range_idx)):
138152
original_gm.graph.erase_node(node)
153+
# print_submodule_call("(fx) after Erase old nodes", original_gm)
154+
155+
# print_submodule_call("(fx) before recompile", original_gm)
139156

140157
original_gm.recompile()
141158

159+
# print_submodule_call("(fx) after recompile", original_gm)
160+
142161
return original_gm
143162

144163

@@ -147,7 +166,7 @@ def fold_range_to_submodule(
147166
start_node_idx: int,
148167
end_node_idx: int,
149168
submodule_hook=None,
150-
submodule_name="extraced_submodule",
169+
submodule_name="extracted_submodule",
151170
group_head_and_tail=True,
152171
):
153172
return convert_to_submodules_graph(
@@ -170,6 +189,7 @@ def _get_submodule_inputs_and_outputs(
170189
original_gm: torch.fx.GraphModule,
171190
start_node_idx: int,
172191
end_node_idx: int,
192+
chain_style=False,
173193
):
174194
count_ctx = NodeProducedOrConsumedCountCtx(
175195
defaultdict(int),
@@ -179,7 +199,11 @@ def _get_submodule_inputs_and_outputs(
179199
node_list = list(original_gm.graph.nodes)
180200

181201
def get_related_node(node):
182-
yield from node.args
202+
for arg in node.args:
203+
if isinstance(arg, tuple):
204+
yield from arg
205+
else:
206+
yield arg
183207
yield node
184208

185209
for node in node_list[0:start_node_idx]:
@@ -200,13 +224,32 @@ def get_related_node(node):
200224
if count_ctx.node2before_input[node] > 0
201225
if count_ctx.node2body[node] > 0
202226
]
203-
204227
output_nodes = [
205228
node
206229
for node in node_list
207230
if not (count_ctx.node2before_input[node] > 0)
208231
if count_ctx.node2body[node] > 0
209232
if count_ctx.node2after_output[node] > 0
210233
]
211-
212-
return input_nodes, output_nodes
234+
if not chain_style:
235+
identity_nodes = []
236+
else:
237+
identity_nodes = [
238+
node
239+
for node in node_list
240+
if count_ctx.node2before_input[node] > 0
241+
if count_ctx.node2body[node] == 0
242+
if count_ctx.node2after_output[node] > 0
243+
][:1]
244+
input_nodes_set = set(input_nodes)
245+
input_nodes = [
246+
*input_nodes,
247+
*[node for node in identity_nodes if node not in input_nodes_set],
248+
]
249+
output_nodes_set = set(output_nodes)
250+
output_nodes = [
251+
*output_nodes,
252+
*[node for node in identity_nodes if node not in output_nodes_set],
253+
]
254+
255+
return input_nodes, output_nodes, identity_nodes

graph_net/torch/extractor.py

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,23 @@
1515

1616
class GraphExtractor:
1717
def __init__(
18-
self, name, dynamic, mut_graph_codes=None, placeholder_auto_rename=False
18+
self,
19+
name,
20+
dynamic,
21+
mut_graph_codes=None,
22+
placeholder_auto_rename=False,
23+
workspace_path=None,
1924
):
2025
self.subgraph_counter = 0
2126
self.name = name
2227
self.dynamic = dynamic
2328
self.mut_graph_codes = mut_graph_codes
2429
self.placeholder_auto_rename = placeholder_auto_rename
25-
self.workspace_path = os.environ.get("GRAPH_NET_EXTRACT_WORKSPACE")
30+
self.workspace_path = (
31+
workspace_path
32+
if workspace_path is not None
33+
else os.environ.get("GRAPH_NET_EXTRACT_WORKSPACE")
34+
)
2635
if not self.workspace_path:
2736
raise EnvironmentError(
2837
"Environment variable 'GRAPH_NET_EXTRACT_WORKSPACE' is not set."
@@ -130,7 +139,7 @@ def extract(
130139
dynamic=True,
131140
mut_graph_codes=None,
132141
placeholder_auto_rename=False,
133-
custom_extractor_path=None,
142+
extractor_config: dict = None,
134143
):
135144
"""
136145
Extract computation graphs from PyTorch nn.Module.
@@ -200,19 +209,24 @@ def forward(self, s0 : torch.SymInt, L_x_ : torch.Tensor):
200209
>>>
201210
"""
202211

203-
def get_graph_extractor_cls():
212+
extractor_config = make_extractor_config(extractor_config)
213+
214+
def get_graph_extractor_maker():
215+
custom_extractor_path = extractor_config["custom_extractor_path"]
216+
custom_extractor_config = extractor_config["custom_extractor_config"]
204217
if custom_extractor_path is None:
205218
return GraphExtractor
206219
import importlib.util as imp
207220

208221
spec = imp.spec_from_file_location("graph_extractor", custom_extractor_path)
209222
graph_extractor = imp.module_from_spec(spec)
210223
spec.loader.exec_module(graph_extractor)
211-
return graph_extractor.GraphExtractor
224+
cls = graph_extractor.GraphExtractor
225+
return lambda *args, **kwargs: cls(custom_extractor_config, *args, **kwargs)
212226

213227
def wrapper(model: torch.nn.Module):
214228
assert isinstance(model, torch.nn.Module), f"{type(model)=}"
215-
extractor = get_graph_extractor_cls()(
229+
extractor = get_graph_extractor_maker()(
216230
name, dynamic, mut_graph_codes, placeholder_auto_rename
217231
)
218232
# return torch.compile(backend=extractor, dynamic=dynamic)
@@ -236,3 +250,18 @@ def decorator_or_wrapper(obj):
236250
)
237251

238252
return decorator_or_wrapper
253+
254+
255+
def make_extractor_config(extractor_config):
256+
kwargs = extractor_config if extractor_config is not None else {}
257+
return make_extractor_config_impl(**kwargs)
258+
259+
260+
def make_extractor_config_impl(
261+
custom_extractor_path: str = None, custom_extractor_config: dict = None
262+
):
263+
config = custom_extractor_config if custom_extractor_config is not None else {}
264+
return {
265+
"custom_extractor_path": custom_extractor_path,
266+
"custom_extractor_config": config,
267+
}

graph_net/torch/fx_graph_serialize_util.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ def serialize_graph_module_to_str(gm: torch.fx.GraphModule) -> str:
2424
(r"torch\._C\._fft\.fft_irfft\(", "torch.fft.irfft("),
2525
(r"torch\._C\._fft\.fft_rfft\(", "torch.fft.rfft("),
2626
(r"torch\._C\._fft\.fft_fftn\(", "torch.fft.fftn("),
27+
(r"torch\._C\._special\.special_logit\(", "torch.special.logit("),
2728
# Add new rules to this list as needed
2829
]
2930
for pattern, repl in replacements:

0 commit comments

Comments
 (0)