
Commit 992b765: merge code (2 parents: 34733d4 + c2280ca)

File tree: 9 files changed (+388, -103 lines)

README.md

Lines changed: 3 additions & 0 deletions

@@ -96,6 +96,9 @@ python -m graph_net.plot_violin \
 
 The scripts expect a file structure of the form `/benchmark_path/category_name/`; items on the x-axis are identified by the names of the sub-directories. After execution, several summary plots of the results grouped by category (model tasks, libraries, ...) are exported to `$GRAPH_NET_BENCHMARK_PATH`.
 
+### Hardware Regression Testing
+We also provide a two-step workflow that validates compiler correctness and performance against a "golden" reference, which is crucial for hardware-specific testing and regression tracking. Details can be found in this [guide](./docs/hardware_test.md).
+
 ### 🧱 Construction & Contribution Guide
 Want to understand how GraphNet is built or contribute new samples?
 Check out the [Construction Guide](./docs/README_contribute.md) for details on the extraction and validation workflow.

docs/hardware_test.md

Lines changed: 20 additions & 0 deletions

## Hardware Regression Testing

### Step 1: Generate Reference Data
First, use `graph_net.paddle.test_reference_device` on a trusted setup (e.g., a specific hardware/compiler version) to generate baseline logs and output files.
```bash
python -m graph_net.paddle.test_reference_device \
    --model-path /path/to/all_models/ \
    --reference-dir ./golden_reference \
    --compiler cinn \
    --device cuda
# --reference-dir: (Required) Directory where the output .log (performance/config) and .pdout (output tensors) files will be saved.
# --compiler: Specifies the compiler backend.
```

### Step 2: Run Regression Test
After changing hardware, run the correctness test script. It reads the reference data, re-runs the models with exactly the same configuration, and compares the new results against the "golden" reference.
```bash
python -m graph_net.paddle.test_device_correctness \
    --reference-dir ./golden_reference \
    --device cuda
```
This script reports any failures (e.g., compilation errors, output mismatches) and prints a performance comparison (speedup/slowdown) against the reference log, so you can quickly identify regressions.
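For intuition, the comparison boils down to a tolerance check on saved outputs plus a timing ratio. The sketch below is illustrative only; the file formats, helper name, and tolerances are assumptions, not graph_net's actual implementation.

```python
# Minimal sketch of a reference-vs-new comparison (illustrative, not graph_net's code).
# Assumes each model produced a saved output array and a measured latency in milliseconds.
import numpy as np


def compare_one_model(ref_output, new_output, ref_ms, new_ms, rtol=1e-4, atol=1e-4):
    """Return (is_correct, speedup) for a single model."""
    # Correctness: new outputs must match the golden reference within tolerance.
    is_correct = np.allclose(ref_output, new_output, rtol=rtol, atol=atol)
    # Performance: > 1.0 means the new run is faster than the reference.
    speedup = ref_ms / new_ms
    return is_correct, speedup


# Example with synthetic data:
ref = np.random.rand(4, 8).astype("float32")
new = ref + np.random.uniform(-1e-6, 1e-6, ref.shape).astype("float32")
ok, speedup = compare_one_model(ref, new, ref_ms=12.0, new_ms=10.0)
print(f"correct={ok}, speedup={speedup:.2f}x")
```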
Lines changed: 20 additions & 0 deletions

#!/bin/bash
set -x

# input model path
MODEL_PATH_IN_SAMPLES=/timm/resnet18
# split the graph at node positions 8, 16 and 32 (head/tail grouped, chained subgraphs)
read -r -d '' json_str <<'EOF'
{
  "output_dir": "/tmp/naive_decompose_workspace",
  "split_positions": [8, 16, 32],
  "group_head_and_tail": true,
  "chain_style": true
}
EOF
CONFIG=$(echo "$json_str" | base64 -w 0)

mkdir -p /tmp/naive_decompose_workspace
GRAPH_NET_ROOT=$(python3 -c "import graph_net; import os; print(os.path.dirname(graph_net.__file__))")
python3 -m graph_net.torch.single_device_runner \
    --model-path "$GRAPH_NET_ROOT/../samples/$MODEL_PATH_IN_SAMPLES" \
    --enable-extract True \
    --extract-name resnet18 \
    --dump-graph-hash-key \
    --custom-extractor-path="$GRAPH_NET_ROOT/torch/naive_graph_decomposer.py" \
    --custom-extractor-config="$CONFIG"
Lines changed: 17 additions & 0 deletions

#!/bin/bash

# input model path
MODEL_PATH_IN_SAMPLES=/timm/resnet18
# split the graph at node positions 8 and 32 (head/tail grouped)
read -r -d '' json_str <<'EOF'
{
  "output_dir": "/tmp/naive_decompose_workspace",
  "split_positions": [8, 32],
  "group_head_and_tail": true
}
EOF
CONFIG=$(echo "$json_str" | base64 -w 0)

mkdir -p /tmp/naive_decompose_workspace
GRAPH_NET_ROOT=$(python3 -c "import graph_net; import os; print(os.path.dirname(graph_net.__file__))")
python3 -m graph_net.torch.single_device_runner \
    --model-path "$GRAPH_NET_ROOT/../samples/$MODEL_PATH_IN_SAMPLES" \
    --enable-extract True \
    --extract-name resnet18 \
    --dump-graph-hash-key \
    --custom-extractor-path="$GRAPH_NET_ROOT/torch/naive_graph_decomposer.py" \
    --custom-extractor-config="$CONFIG"
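Both scripts hand the decomposer its settings through `--custom-extractor-config` as a base64-encoded JSON string. The snippet below only illustrates that encoding round-trip; it is not graph_net's own decoding code, and the variable names are made up.

```python
# Illustration of the base64-encoded JSON config round-trip (not graph_net's own code).
import base64
import json

config = {
    "output_dir": "/tmp/naive_decompose_workspace",
    "split_positions": [8, 32],
    "group_head_and_tail": True,
}

# Encode the same way the shell scripts do with `base64 -w 0`.
encoded = base64.b64encode(json.dumps(config).encode("utf-8")).decode("ascii")

# A consumer of --custom-extractor-config can recover the dict like this.
decoded = json.loads(base64.b64decode(encoded).decode("utf-8"))
assert decoded["split_positions"] == [8, 32]
print(encoded)
print(decoded)
```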

graph_net/test/torch_extractor_test.py

Lines changed: 8 additions & 21 deletions

@@ -19,36 +19,22 @@ def forward(self, x):
 
 
 class WrapperModule(torch.nn.Module):
-    def __init__(self, submodule):
+    def __init__(self, submodule, seq_no):
         super().__init__()
         self.submodule = submodule
+        self.seq_no = seq_no
 
     def forward(self, *args):
         print("Args:")
         print(args)
         return self.submodule(*args)
 
 
-def submodule_hook(submodule: torch.fx.GraphModule):
-    print(f"{'-'*8} [submodule] {'-'*8}\n")
+def submodule_hook(submodule: torch.fx.GraphModule, seq_no):
+    print(f"{'-'*8} [submodule-{seq_no}] {'-'*8}\n")
     print(submodule.graph)
-    """
-    graph():
-        %add : [num_users=1] = placeholder[target=add]
-        %mul : [num_users=1] = call_function[target=operator.mul](args = (%add, 2), kwargs = {})
-        %clamp : [num_users=1] = call_method[target=clamp](args = (%mul,), kwargs = {min: 0.0, max: 1.0})
-        return (clamp,)
-
-    """
     print(submodule.code)
-    """
-    def forward(self, add):
-        mul = add * 2; add = None
-        clamp = mul.clamp(min = 0.0, max = 1.0); mul = None
-        return (clamp,)
-    """
-
-    return WrapperModule(submodule)
+    return WrapperModule(submodule, seq_no)
 
 
 class TestExtractorSubmodule(unittest.TestCase):
@@ -87,9 +73,10 @@ def forward(self, x):
 
         folded = fold_range_to_submodule(
             symbolic_traced,
-            start_node_idx=2,
-            end_node_idx=4,
+            start_node_idx=0,
+            end_node_idx=2,
             submodule_hook=submodule_hook,
+            # group_head_and_tail=False,
         )
         folded_output = folded(inp)
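The test now passes a sequence number into the hook so each folded submodule knows its position in the chain. Below is a small self-contained torch.fx sketch of that pattern; it builds toy "pieces" by hand instead of calling `fold_range_to_submodule` (whose import path is not shown in this diff), so treat it as an illustration only.

```python
# Minimal sketch of the seq_no-aware hook pattern (illustrative; not the
# fold_range_to_submodule implementation exercised by the test above).
import torch
import torch.fx


class WrapperModule(torch.nn.Module):
    """Wraps a folded submodule and remembers its position in the chain."""

    def __init__(self, submodule, seq_no):
        super().__init__()
        self.submodule = submodule
        self.seq_no = seq_no

    def forward(self, *args):
        print(f"[submodule-{self.seq_no}] input shapes: {[tuple(a.shape) for a in args]}")
        return self.submodule(*args)


def submodule_hook(submodule: torch.fx.GraphModule, seq_no):
    # Same shape as the hook in the test: inspect the graph, then wrap it.
    print(f"{'-'*8} [submodule-{seq_no}] {'-'*8}")
    print(submodule.code)
    return WrapperModule(submodule, seq_no)


# Pretend the model was already split into two pieces; here each piece is a
# traced toy module, numbered in order by the hook.
class Piece(torch.nn.Module):
    def forward(self, x):
        return (x * 2).clamp(min=0.0, max=1.0)


pieces = [submodule_hook(torch.fx.symbolic_trace(Piece()), seq_no=i) for i in range(2)]

x = torch.rand(3)
for piece in pieces:
    x = piece(x)
print(x)
```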
